airbyte_cdk.sources.declarative.parsers.model_to_component_factory
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import logging
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Tuple,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream
from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    ConfiguredAirbyteCatalog,
    FailureType,
    Level,
    StreamDescriptor,
)
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
    PaginationResetLimits,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    Action1 as PaginationResetActionModel,
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomConfigTransformation as CustomConfigTransformationModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    CustomValidationStrategy as CustomValidationStrategyModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    PaginationReset as PaginationResetModel,
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RefreshTokenUpdater as RefreshTokenUpdaterModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import (
    JsonSchemaPropertySelector,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import (
    PerPartitionRequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    ParametrizedComponentsResolver,
    StreamConfig,
    StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import (
    CachingSchemaLoaderDecorator,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import (
    StreamSlicer,
    StreamSlicerTestReadDecorator,
)
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
    DeclarativePartitionFactory,
    StreamSlicerPartitionGenerator,
)
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import (
    ConcurrentCursor,
    Cursor,
    CursorField,
    FinalStateCursor,
)
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
    StreamSlicer as ConcurrentStreamSlicer,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}
_NO_STREAM_SLICING = SinglePartitionRouter(parameters={})

# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
# this would be a circular import
MAX_SLICES = 5

LOGGER = logging.getLogger(f"airbyte.model_to_component_factory")


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
        configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
        api_budget: Optional[APIBudget] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
            configured_catalog
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget]] = api_budget
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DeclarativeStreamModel: self.create_default_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    @staticmethod
    def _create_stream_name_to_configured_stream(
        configured_catalog: Optional[ConfiguredAirbyteCatalog],
    ) -> Mapping[str, ConfiguredAirbyteStream]:
        return (
            {stream.stream.name: stream for stream in configured_catalog.streams}
            if configured_catalog
            else {}
        )

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and a Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating
        declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG
        attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends
        it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)
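    # ------------------------------------------------------------------
    # Illustrative usage sketch (not part of the CDK source): building a runtime
    # component from a manifest fragment with this factory. The component
    # definition below is a hypothetical minimal example; only the signatures of
    # the factory methods above are assumed.
    #
    #   factory = ModelToComponentFactory()
    #   check = factory.create_component(
    #       model_type=CheckStreamModel,
    #       component_definition={"type": "CheckStream", "stream_names": ["users"]},
    #       config={},
    #   )
    #
    # `create_component` parses the mapping into the Pydantic model and then
    # dispatches through PYDANTIC_MODEL_TO_CONSTRUCTOR to the matching create_* method.
    # ------------------------------------------------------------------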
    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )
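    # Illustrative sketch (not part of the CDK source): the kind of manifest fragment
    # that `create_add_fields` consumes. The field path and interpolated value below
    # are hypothetical.
    #
    #   {
    #       "type": "AddFields",
    #       "fields": [
    #           {
    #               "type": "AddedFieldDefinition",
    #               "path": ["ingested_at"],
    #               "value": "{{ now_utc() }}",
    #           }
    #       ],
    #   }
    #
    # Each entry in "fields" is parsed into an AddedFieldDefinitionModel and built via
    # create_added_field_definition, which wraps "value" in an InterpolatedString.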
    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )
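    # Illustrative sketch (not part of the CDK source): a manifest fragment that
    # `create_api_key_authenticator` accepts. The header name and config key are
    # hypothetical.
    #
    #   {
    #       "type": "ApiKeyAuthenticator",
    #       "api_token": "{{ config['api_key'] }}",
    #       "inject_into": {
    #           "type": "RequestOption",
    #           "inject_into": "header",
    #           "field_name": "X-API-KEY",
    #       },
    #   }
    #
    # When "inject_into" is omitted, the deprecated "header" field is still honored and
    # translated into a header RequestOption, as the code above shows.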
    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )
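    # Illustrative sketch (not part of the CDK source): the general shape of a
    # SessionTokenAuthenticator definition that the method above converts into an
    # ApiKeyAuthenticator or BearerAuthenticator backed by a SessionTokenProvider.
    # The endpoint, token path, and field names are hypothetical.
    #
    #   {
    #       "type": "SessionTokenAuthenticator",
    #       "login_requester": {
    #           "type": "HttpRequester",
    #           "url_base": "https://api.example.com",
    #           "path": "/session",
    #           "http_method": "POST",
    #       },
    #       "session_token_path": ["token"],
    #       "expiration_duration": "PT1H",
    #       "request_authentication": {
    #           "type": "ApiKey",
    #           "inject_into": {
    #               "type": "RequestOption",
    #               "inject_into": "header",
    #               "field_name": "X-Session-Token",
    #           },
    #       },
    #   }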
    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )
    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state
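    # Illustrative sketch (not part of the CDK source): the duck-typed contract that
    # `apply_stream_state_migrations` relies on. The migration class and state keys
    # below are hypothetical and only show the expected should_migrate/migrate shape.
    #
    #   class RenameCursorKeyMigration:
    #       def should_migrate(self, stream_state):
    #           return "updated" in stream_state
    #
    #       def migrate(self, stream_state):
    #           return {**stream_state, "updated_at": stream_state["updated"]}
    #
    #   state = ModelToComponentFactory.apply_stream_state_migrations(
    #       [RenameCursorKeyMigration()], {"updated": "2024-01-01"}
    #   )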
    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the
        # actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models
        # [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91).
        # So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter
        # propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests.
Confirmed functionality is working in practice 1514 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1515 lookback_window=lookback_window, 1516 slice_range=step_length, 1517 cursor_granularity=cursor_granularity, 1518 clamping_strategy=clamping_strategy, 1519 ) 1520 1521 def create_concurrent_cursor_from_incrementing_count_cursor( 1522 self, 1523 model_type: Type[BaseModel], 1524 component_definition: ComponentDefinition, 1525 stream_name: str, 1526 stream_namespace: Optional[str], 1527 stream_state: MutableMapping[str, Any], 1528 config: Config, 1529 message_repository: Optional[MessageRepository] = None, 1530 **kwargs: Any, 1531 ) -> ConcurrentCursor: 1532 component_type = component_definition.get("type") 1533 if component_definition.get("type") != model_type.__name__: 1534 raise ValueError( 1535 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1536 ) 1537 1538 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1539 1540 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1541 raise ValueError( 1542 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1543 ) 1544 1545 interpolated_start_value = ( 1546 InterpolatedString.create( 1547 incrementing_count_cursor_model.start_value, # type: ignore 1548 parameters=incrementing_count_cursor_model.parameters or {}, 1549 ) 1550 if incrementing_count_cursor_model.start_value 1551 else 0 1552 ) 1553 1554 interpolated_cursor_field = InterpolatedString.create( 1555 incrementing_count_cursor_model.cursor_field, 1556 parameters=incrementing_count_cursor_model.parameters or {}, 1557 ) 1558 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1559 1560 connector_state_converter = IncrementingCountStreamStateConverter( 1561 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1562 ) 1563 1564 return ConcurrentCursor( 1565 stream_name=stream_name, 1566 stream_namespace=stream_namespace, 1567 stream_state=stream_state, 1568 message_repository=message_repository or self._message_repository, 1569 connector_state_manager=self._connector_state_manager, 1570 connector_state_converter=connector_state_converter, 1571 cursor_field=cursor_field, 1572 slice_boundary_fields=None, 1573 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1574 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1575 ) 1576 1577 def _assemble_weekday(self, weekday: str) -> Weekday: 1578 match weekday: 1579 case "MONDAY": 1580 return Weekday.MONDAY 1581 case "TUESDAY": 1582 return Weekday.TUESDAY 1583 case "WEDNESDAY": 1584 return Weekday.WEDNESDAY 1585 case "THURSDAY": 1586 return Weekday.THURSDAY 1587 case "FRIDAY": 1588 return Weekday.FRIDAY 1589 case "SATURDAY": 1590 return Weekday.SATURDAY 1591 case "SUNDAY": 1592 return Weekday.SUNDAY 1593 case _: 1594 raise ValueError(f"Unknown weekday {weekday}") 1595 1596 def create_concurrent_cursor_from_perpartition_cursor( 1597 self, 1598 state_manager: ConnectorStateManager, 1599 model_type: Type[BaseModel], 1600 component_definition: ComponentDefinition, 1601 stream_name: str, 1602 stream_namespace: Optional[str], 1603 config: Config, 1604 stream_state: MutableMapping[str, Any], 1605 partition_router: PartitionRouter, 1606 attempt_to_create_cursor_if_not_provided: bool = False, 1607 **kwargs: Any, 1608 ) -> ConcurrentPerPartitionCursor: 1609 component_type = component_definition.get("type") 1610 if component_definition.get("type") != model_type.__name__: 1611 raise ValueError( 1612 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1613 ) 1614 1615 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1616 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1617 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1618 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1619 if "$parameters" not in component_definition and "parameters" in component_definition: 1620 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1621 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1622 1623 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1624 raise ValueError( 1625 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1626 ) 1627 1628 interpolated_cursor_field = InterpolatedString.create( 1629 datetime_based_cursor_model.cursor_field, 1630 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1631 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1632 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1633 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1634 parameters=datetime_based_cursor_model.parameters or {}, 1635 ) 1636 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1637 1638 datetime_format = datetime_based_cursor_model.datetime_format 1639 1640 cursor_granularity = ( 1641 parse_duration(datetime_based_cursor_model.cursor_granularity) 1642 if datetime_based_cursor_model.cursor_granularity 1643 else None 1644 ) 1645 1646 connector_state_converter: DateTimeStreamStateConverter 1647 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1648 datetime_format=datetime_format, 1649 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1650 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1651 cursor_granularity=cursor_granularity, 1652 ) 1653 1654 # Create the cursor factory 1655 cursor_factory = ConcurrentCursorFactory( 1656 partial( 1657 self.create_concurrent_cursor_from_datetime_based_cursor, 1658 state_manager=state_manager, 1659 model_type=model_type, 1660 component_definition=component_definition, 1661 stream_name=stream_name, 1662 stream_namespace=stream_namespace, 1663 config=config, 1664 message_repository=NoopMessageRepository(), 1665 ) 1666 ) 1667 1668 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1669 use_global_cursor = isinstance( 1670 partition_router, GroupingPartitionRouter 1671 ) or component_definition.get("global_substream_cursor", False) 1672 1673 # Return the concurrent cursor and state converter 1674 return ConcurrentPerPartitionCursor( 1675 cursor_factory=cursor_factory, 1676 partition_router=partition_router, 1677 stream_name=stream_name, 1678 stream_namespace=stream_namespace, 1679 stream_state=stream_state, 1680 message_repository=self._message_repository, # type: ignore 1681 connector_state_manager=state_manager, 1682 connector_state_converter=connector_state_converter, 1683 cursor_field=cursor_field, 1684 use_global_cursor=use_global_cursor, 1685 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1686 ) 1687 1688 @staticmethod 1689 def create_constant_backoff_strategy( 1690 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1691 ) -> ConstantBackoffStrategy: 1692 return ConstantBackoffStrategy( 1693 backoff_time_in_seconds=model.backoff_time_in_seconds, 1694 config=config, 1695 parameters=model.parameters or {}, 1696 ) 1697 1698 def create_cursor_pagination( 1699 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1700 ) -> CursorPaginationStrategy: 1701 if isinstance(decoder, PaginationDecoderDecorator): 1702 inner_decoder = decoder.decoder 1703 else: 1704 inner_decoder = decoder 1705 decoder = PaginationDecoderDecorator(decoder=decoder) 1706 1707 if self._is_supported_decoder_for_pagination(inner_decoder): 1708 decoder_to_use = decoder 1709 else: 1710 raise ValueError( 1711 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1712 ) 1713 1714 return CursorPaginationStrategy( 1715 cursor_value=model.cursor_value, 1716 decoder=decoder_to_use, 1717 
page_size=model.page_size, 1718 stop_condition=model.stop_condition, 1719 config=config, 1720 parameters=model.parameters or {}, 1721 ) 1722 1723 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1724 """ 1725 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1726 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1727 :param model: The Pydantic model of the custom component being created 1728 :param config: The custom defined connector config 1729 :return: The declarative component built from the Pydantic model to be used at runtime 1730 """ 1731 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1732 component_fields = get_type_hints(custom_component_class) 1733 model_args = model.dict() 1734 model_args["config"] = config 1735 1736 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1737 # we defer to these arguments over the component's definition 1738 for key, arg in kwargs.items(): 1739 model_args[key] = arg 1740 1741 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1742 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1743 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1744 for model_field, model_value in model_args.items(): 1745 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1746 if ( 1747 isinstance(model_value, dict) 1748 and "type" not in model_value 1749 and model_field in component_fields 1750 ): 1751 derived_type = self._derive_component_type_from_type_hints( 1752 component_fields.get(model_field) 1753 ) 1754 if derived_type: 1755 model_value["type"] = derived_type 1756 1757 if self._is_component(model_value): 1758 model_args[model_field] = self._create_nested_component( 1759 model, 1760 model_field, 1761 model_value, 1762 config, 1763 **kwargs, 1764 ) 1765 elif isinstance(model_value, list): 1766 vals = [] 1767 for v in model_value: 1768 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1769 derived_type = self._derive_component_type_from_type_hints( 1770 component_fields.get(model_field) 1771 ) 1772 if derived_type: 1773 v["type"] = derived_type 1774 if self._is_component(v): 1775 vals.append( 1776 self._create_nested_component( 1777 model, 1778 model_field, 1779 v, 1780 config, 1781 **kwargs, 1782 ) 1783 ) 1784 else: 1785 vals.append(v) 1786 model_args[model_field] = vals 1787 1788 kwargs = { 1789 class_field: model_args[class_field] 1790 for class_field in component_fields.keys() 1791 if class_field in model_args 1792 } 1793 return custom_component_class(**kwargs) 1794 1795 @staticmethod 1796 def _get_class_from_fully_qualified_class_name( 1797 full_qualified_class_name: str, 1798 ) -> Any: 1799 """Get a class from its fully qualified name. 1800 1801 If a custom components module is needed, we assume it is already registered - probably 1802 as `source_declarative_manifest.components` or `components`. 1803 1804 Args: 1805 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 
1806 1807 Returns: 1808 Any: The class object. 1809 1810 Raises: 1811 ValueError: If the class cannot be loaded. 1812 """ 1813 split = full_qualified_class_name.split(".") 1814 module_name_full = ".".join(split[:-1]) 1815 class_name = split[-1] 1816 1817 try: 1818 module_ref = importlib.import_module(module_name_full) 1819 except ModuleNotFoundError as e: 1820 if split[0] == "source_declarative_manifest": 1821 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1822 try: 1823 import os 1824 1825 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1826 module_ref = importlib.import_module( 1827 module_name_with_source_declarative_manifest 1828 ) 1829 except ModuleNotFoundError: 1830 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1831 else: 1832 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1833 1834 try: 1835 return getattr(module_ref, class_name) 1836 except AttributeError as e: 1837 raise ValueError( 1838 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1839 ) from e 1840 1841 @staticmethod 1842 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1843 interface = field_type 1844 while True: 1845 origin = get_origin(interface) 1846 if origin: 1847 # Unnest types until we reach the raw type 1848 # List[T] -> T 1849 # Optional[List[T]] -> T 1850 args = get_args(interface) 1851 interface = args[0] 1852 else: 1853 break 1854 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1855 return interface.__name__ 1856 return None 1857 1858 @staticmethod 1859 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1860 if not cls: 1861 return False 1862 return cls.__module__ == "builtins" 1863 1864 @staticmethod 1865 def _extract_missing_parameters(error: TypeError) -> List[str]: 1866 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1867 if parameter_search: 1868 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1869 else: 1870 return [] 1871 1872 def _create_nested_component( 1873 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1874 ) -> Any: 1875 type_name = model_value.get("type", None) 1876 if not type_name: 1877 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1878 return model_value 1879 1880 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1881 if model_type: 1882 parsed_model = model_type.parse_obj(model_value) 1883 try: 1884 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1885 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1886 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1887 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1888 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1889 # are needed by a component and could not be shared. 
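                # Illustrative sketch (hypothetical manifest snippet; the URL and cursor expression are
                # examples only): a custom component nesting a DefaultPaginator can supply the
                # keyword-only url_base its constructor requires through $parameters, which the block
                # below matches against the constructor's kwonly arguments:
                #
                #   paginator:
                #     type: DefaultPaginator
                #     $parameters:
                #       url_base: "https://api.example.com/v1"
                #     pagination_strategy:
                #       type: CursorPagination
                #       cursor_value: "{{ response.next_page }}"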
1890 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1891 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1892 model_parameters = model_value.get("$parameters", {}) 1893 matching_parameters = { 1894 kwarg: model_parameters[kwarg] 1895 for kwarg in constructor_kwargs 1896 if kwarg in model_parameters 1897 } 1898 matching_kwargs = { 1899 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1900 } 1901 return self._create_component_from_model( 1902 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1903 ) 1904 except TypeError as error: 1905 missing_parameters = self._extract_missing_parameters(error) 1906 if missing_parameters: 1907 raise ValueError( 1908 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1909 + ", ".join( 1910 ( 1911 f"{type_name}.$parameters.{parameter}" 1912 for parameter in missing_parameters 1913 ) 1914 ) 1915 ) 1916 raise TypeError( 1917 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1918 ) 1919 else: 1920 raise ValueError( 1921 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1922 ) 1923 1924 @staticmethod 1925 def _is_component(model_value: Any) -> bool: 1926 return isinstance(model_value, dict) and model_value.get("type") is not None 1927 1928 def create_default_stream( 1929 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1930 ) -> AbstractStream: 1931 primary_key = model.primary_key.__root__ if model.primary_key else None 1932 self._migrate_state(model, config) 1933 1934 partition_router = self._build_stream_slicer_from_partition_router( 1935 model.retriever, 1936 config, 1937 stream_name=model.name, 1938 **kwargs, 1939 ) 1940 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1941 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1942 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1943 1944 end_time_option = ( 1945 self._create_component_from_model( 1946 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1947 ) 1948 if cursor_model.end_time_option 1949 else None 1950 ) 1951 start_time_option = ( 1952 self._create_component_from_model( 1953 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1954 ) 1955 if cursor_model.start_time_option 1956 else None 1957 ) 1958 1959 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 1960 start_time_option=start_time_option, 1961 end_time_option=end_time_option, 1962 partition_field_start=cursor_model.partition_field_start, 1963 partition_field_end=cursor_model.partition_field_end, 1964 config=config, 1965 parameters=model.parameters or {}, 1966 ) 1967 request_options_provider = ( 1968 datetime_request_options_provider 1969 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 1970 else PerPartitionRequestOptionsProvider( 1971 partition_router, datetime_request_options_provider 1972 ) 1973 ) 1974 elif model.incremental_sync and isinstance( 1975 model.incremental_sync, IncrementingCountCursorModel 1976 ): 1977 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 1978 raise ValueError( 1979 "PerPartition does not support per partition states because switching to global state is time based" 1980 ) 1981 1982 cursor_model: 
IncrementingCountCursorModel = model.incremental_sync # type: ignore 1983 1984 start_time_option = ( 1985 self._create_component_from_model( 1986 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1987 config, 1988 parameters=cursor_model.parameters or {}, 1989 ) 1990 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1991 else None 1992 ) 1993 1994 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1995 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1996 partition_field_start = "start" 1997 1998 request_options_provider = DatetimeBasedRequestOptionsProvider( 1999 start_time_option=start_time_option, 2000 partition_field_start=partition_field_start, 2001 config=config, 2002 parameters=model.parameters or {}, 2003 ) 2004 else: 2005 request_options_provider = None 2006 2007 transformations = [] 2008 if model.transformations: 2009 for transformation_model in model.transformations: 2010 transformations.append( 2011 self._create_component_from_model(model=transformation_model, config=config) 2012 ) 2013 file_uploader = None 2014 if model.file_uploader: 2015 file_uploader = self._create_component_from_model( 2016 model=model.file_uploader, config=config 2017 ) 2018 2019 stream_slicer: ConcurrentStreamSlicer = ( 2020 partition_router 2021 if isinstance(concurrent_cursor, FinalStateCursor) 2022 else concurrent_cursor 2023 ) 2024 2025 retriever = self._create_component_from_model( 2026 model=model.retriever, 2027 config=config, 2028 name=model.name, 2029 primary_key=primary_key, 2030 request_options_provider=request_options_provider, 2031 stream_slicer=stream_slicer, 2032 partition_router=partition_router, 2033 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2034 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2035 cursor=concurrent_cursor, 2036 transformations=transformations, 2037 file_uploader=file_uploader, 2038 incremental_sync=model.incremental_sync, 2039 ) 2040 if isinstance(retriever, AsyncRetriever): 2041 stream_slicer = retriever.stream_slicer 2042 2043 schema_loader: SchemaLoader 2044 if model.schema_loader and isinstance(model.schema_loader, list): 2045 nested_schema_loaders = [ 2046 self._create_component_from_model(model=nested_schema_loader, config=config) 2047 for nested_schema_loader in model.schema_loader 2048 ] 2049 schema_loader = CompositeSchemaLoader( 2050 schema_loaders=nested_schema_loaders, parameters={} 2051 ) 2052 elif model.schema_loader: 2053 schema_loader = self._create_component_from_model( 2054 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2055 config=config, 2056 ) 2057 else: 2058 options = model.parameters or {} 2059 if "name" not in options: 2060 options["name"] = model.name 2061 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2062 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2063 2064 stream_name = model.name or "" 2065 return DefaultStream( 2066 partition_generator=StreamSlicerPartitionGenerator( 2067 DeclarativePartitionFactory( 2068 stream_name, 2069 schema_loader, 2070 retriever, 2071 self._message_repository, 2072 ), 2073 stream_slicer, 2074 slice_limit=self._limit_slices_fetched, 2075 ), 2076 name=stream_name, 2077 json_schema=schema_loader.get_json_schema, 2078 
            primary_key=get_primary_key_from_stream(primary_key),
            cursor_field=concurrent_cursor.cursor_field.cursor_field_key
            if hasattr(concurrent_cursor, "cursor_field")
            else "",  # FIXME: the cursor field should be part of the Cursor interface
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )

    def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(
            stream_name=stream_name, namespace=None
        )
        if model.state_migrations:
            state_transformations = [
                self._create_component_from_model(state_migration, config, declarative_stream=model)
                for state_migration in model.state_migrations
            ]
        else:
            state_transformations = []
        stream_state = self.apply_stream_state_migrations(state_transformations, stream_state)
        self._connector_state_manager.update_state_for_stream(
            stream_name=stream_name, namespace=None, value=stream_state
        )

    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_data_feed")
            and model.incremental_sync.is_data_feed
        )

    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_client_side_incremental")
            and model.incremental_sync.is_client_side_incremental
        )

    def _build_stream_slicer_from_partition_router(
        self,
        model: Union[
            AsyncRetrieverModel,
            CustomRetrieverModel,
            SimpleRetrieverModel,
        ],
        config: Config,
        stream_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PartitionRouter:
        if (
            hasattr(model, "partition_router")
            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
            and model.partition_router
        ):
            stream_slicer_model = model.partition_router
            if isinstance(stream_slicer_model, list):
                return CartesianProductStreamSlicer(
                    [
                        self._create_component_from_model(
                            model=slicer, config=config, stream_name=stream_name or ""
                        )
                        for slicer in stream_slicer_model
                    ],
                    parameters={},
                )
            elif isinstance(stream_slicer_model, dict):
                # The partition router comes from a CustomRetrieverModel and therefore has not been parsed into a model
                params = stream_slicer_model.get("$parameters")
                if not isinstance(params, dict):
                    params = {}
                    stream_slicer_model["$parameters"] = params

                if stream_name is not None:
                    params["stream_name"] = stream_name

                return self._create_nested_component(  # type: ignore[no-any-return]  # There is no guarantee that this will return a stream slicer.
If not, we expect an AttributeError during the call to `stream_slices` 2156 model, 2157 "partition_router", 2158 stream_slicer_model, 2159 config, 2160 **kwargs, 2161 ) 2162 else: 2163 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2164 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2165 ) 2166 return SinglePartitionRouter(parameters={}) 2167 2168 def _build_concurrent_cursor( 2169 self, 2170 model: DeclarativeStreamModel, 2171 stream_slicer: Optional[PartitionRouter], 2172 config: Config, 2173 ) -> Cursor: 2174 stream_name = model.name or "" 2175 stream_state = self._connector_state_manager.get_stream_state(stream_name, None) 2176 2177 if ( 2178 model.incremental_sync 2179 and stream_slicer 2180 and not isinstance(stream_slicer, SinglePartitionRouter) 2181 ): 2182 if isinstance(model.incremental_sync, IncrementingCountCursorModel): 2183 # We don't currently support usage of partition routing and IncrementingCountCursor at the 2184 # same time because we didn't solve for design questions like what the lookback window would 2185 # be as well as global cursor fall backs. We have not seen customers that have needed both 2186 # at the same time yet and are currently punting on this until we need to solve it. 2187 raise ValueError( 2188 f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}." 2189 ) 2190 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2191 state_manager=self._connector_state_manager, 2192 model_type=DatetimeBasedCursorModel, 2193 component_definition=model.incremental_sync.__dict__, 2194 stream_name=stream_name, 2195 stream_state=stream_state, 2196 stream_namespace=None, 2197 config=config or {}, 2198 partition_router=stream_slicer, 2199 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2200 ) 2201 elif model.incremental_sync: 2202 if type(model.incremental_sync) == IncrementingCountCursorModel: 2203 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2204 model_type=IncrementingCountCursorModel, 2205 component_definition=model.incremental_sync.__dict__, 2206 stream_name=stream_name, 2207 stream_namespace=None, 2208 stream_state=stream_state, 2209 config=config or {}, 2210 ) 2211 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2212 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2213 model_type=type(model.incremental_sync), 2214 component_definition=model.incremental_sync.__dict__, 2215 stream_name=stream_name, 2216 stream_namespace=None, 2217 stream_state=stream_state, 2218 config=config or {}, 2219 attempt_to_create_cursor_if_not_provided=True, 2220 ) 2221 else: 2222 raise ValueError( 2223 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2224 ) 2225 return FinalStateCursor(stream_name, None, self._message_repository) 2226 2227 def create_default_error_handler( 2228 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2229 ) -> DefaultErrorHandler: 2230 backoff_strategies = [] 2231 if model.backoff_strategies: 2232 for backoff_strategy_model in model.backoff_strategies: 2233 backoff_strategies.append( 2234 self._create_component_from_model(model=backoff_strategy_model, config=config) 2235 ) 2236 2237 response_filters = [] 2238 if model.response_filters: 2239 for response_filter_model in model.response_filters: 2240 response_filters.append( 2241 self._create_component_from_model(model=response_filter_model, config=config) 2242 ) 2243 response_filters.append( 2244 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2245 ) 2246 2247 return DefaultErrorHandler( 2248 backoff_strategies=backoff_strategies, 2249 max_retries=model.max_retries, 2250 response_filters=response_filters, 2251 config=config, 2252 parameters=model.parameters or {}, 2253 ) 2254 2255 def create_default_paginator( 2256 self, 2257 model: DefaultPaginatorModel, 2258 config: Config, 2259 *, 2260 url_base: str, 2261 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2262 decoder: Optional[Decoder] = None, 2263 cursor_used_for_stop_condition: Optional[Cursor] = None, 2264 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2265 if decoder: 2266 if self._is_supported_decoder_for_pagination(decoder): 2267 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2268 else: 2269 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2270 else: 2271 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2272 page_size_option = ( 2273 self._create_component_from_model(model=model.page_size_option, config=config) 2274 if model.page_size_option 2275 else None 2276 ) 2277 page_token_option = ( 2278 self._create_component_from_model(model=model.page_token_option, config=config) 2279 if model.page_token_option 2280 else None 2281 ) 2282 pagination_strategy = self._create_component_from_model( 2283 model=model.pagination_strategy, 2284 config=config, 2285 decoder=decoder_to_use, 2286 extractor_model=extractor_model, 2287 ) 2288 if cursor_used_for_stop_condition: 2289 pagination_strategy = StopConditionPaginationStrategyDecorator( 2290 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2291 ) 2292 paginator = DefaultPaginator( 2293 decoder=decoder_to_use, 2294 page_size_option=page_size_option, 2295 page_token_option=page_token_option, 2296 pagination_strategy=pagination_strategy, 2297 url_base=url_base, 2298 config=config, 2299 parameters=model.parameters or {}, 2300 ) 2301 if self._limit_pages_fetched_per_slice: 2302 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2303 return paginator 2304 2305 def create_dpath_extractor( 2306 self, 2307 model: 
DpathExtractorModel, 2308 config: Config, 2309 decoder: Optional[Decoder] = None, 2310 **kwargs: Any, 2311 ) -> DpathExtractor: 2312 if decoder: 2313 decoder_to_use = decoder 2314 else: 2315 decoder_to_use = JsonDecoder(parameters={}) 2316 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2317 return DpathExtractor( 2318 decoder=decoder_to_use, 2319 field_path=model_field_path, 2320 config=config, 2321 parameters=model.parameters or {}, 2322 ) 2323 2324 @staticmethod 2325 def create_response_to_file_extractor( 2326 model: ResponseToFileExtractorModel, 2327 **kwargs: Any, 2328 ) -> ResponseToFileExtractor: 2329 return ResponseToFileExtractor(parameters=model.parameters or {}) 2330 2331 @staticmethod 2332 def create_exponential_backoff_strategy( 2333 model: ExponentialBackoffStrategyModel, config: Config 2334 ) -> ExponentialBackoffStrategy: 2335 return ExponentialBackoffStrategy( 2336 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2337 ) 2338 2339 @staticmethod 2340 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2341 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2342 2343 def create_http_requester( 2344 self, 2345 model: HttpRequesterModel, 2346 config: Config, 2347 decoder: Decoder = JsonDecoder(parameters={}), 2348 query_properties_key: Optional[str] = None, 2349 use_cache: Optional[bool] = None, 2350 *, 2351 name: str, 2352 ) -> HttpRequester: 2353 authenticator = ( 2354 self._create_component_from_model( 2355 model=model.authenticator, 2356 config=config, 2357 url_base=model.url or model.url_base, 2358 name=name, 2359 decoder=decoder, 2360 ) 2361 if model.authenticator 2362 else None 2363 ) 2364 error_handler = ( 2365 self._create_component_from_model(model=model.error_handler, config=config) 2366 if model.error_handler 2367 else DefaultErrorHandler( 2368 backoff_strategies=[], 2369 response_filters=[], 2370 config=config, 2371 parameters=model.parameters or {}, 2372 ) 2373 ) 2374 2375 api_budget = self._api_budget 2376 2377 request_options_provider = InterpolatedRequestOptionsProvider( 2378 request_body=model.request_body, 2379 request_body_data=model.request_body_data, 2380 request_body_json=model.request_body_json, 2381 request_headers=model.request_headers, 2382 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2383 query_properties_key=query_properties_key, 2384 config=config, 2385 parameters=model.parameters or {}, 2386 ) 2387 2388 assert model.use_cache is not None # for mypy 2389 assert model.http_method is not None # for mypy 2390 2391 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2392 2393 return HttpRequester( 2394 name=name, 2395 url=model.url, 2396 url_base=model.url_base, 2397 path=model.path, 2398 authenticator=authenticator, 2399 error_handler=error_handler, 2400 api_budget=api_budget, 2401 http_method=HttpMethod[model.http_method.value], 2402 request_options_provider=request_options_provider, 2403 config=config, 2404 disable_retries=self._disable_retries, 2405 parameters=model.parameters or {}, 2406 message_repository=self._message_repository, 2407 use_cache=should_use_cache, 2408 decoder=decoder, 2409 stream_response=decoder.is_stream_response() if decoder else False, 2410 ) 2411 2412 @staticmethod 2413 def create_http_response_filter( 2414 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2415 ) -> 
HttpResponseFilter: 2416 if model.action: 2417 action = ResponseAction(model.action.value) 2418 else: 2419 action = None 2420 2421 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2422 2423 http_codes = ( 2424 set(model.http_codes) if model.http_codes else set() 2425 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2426 2427 return HttpResponseFilter( 2428 action=action, 2429 failure_type=failure_type, 2430 error_message=model.error_message or "", 2431 error_message_contains=model.error_message_contains or "", 2432 http_codes=http_codes, 2433 predicate=model.predicate or "", 2434 config=config, 2435 parameters=model.parameters or {}, 2436 ) 2437 2438 @staticmethod 2439 def create_inline_schema_loader( 2440 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2441 ) -> InlineSchemaLoader: 2442 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2443 2444 def create_complex_field_type( 2445 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2446 ) -> ComplexFieldType: 2447 items = ( 2448 self._create_component_from_model(model=model.items, config=config) 2449 if isinstance(model.items, ComplexFieldTypeModel) 2450 else model.items 2451 ) 2452 2453 return ComplexFieldType(field_type=model.field_type, items=items) 2454 2455 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2456 target_type = ( 2457 self._create_component_from_model(model=model.target_type, config=config) 2458 if isinstance(model.target_type, ComplexFieldTypeModel) 2459 else model.target_type 2460 ) 2461 2462 return TypesMap( 2463 target_type=target_type, 2464 current_type=model.current_type, 2465 condition=model.condition if model.condition is not None else "True", 2466 ) 2467 2468 def create_schema_type_identifier( 2469 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2470 ) -> SchemaTypeIdentifier: 2471 types_mapping = [] 2472 if model.types_mapping: 2473 types_mapping.extend( 2474 [ 2475 self._create_component_from_model(types_map, config=config) 2476 for types_map in model.types_mapping 2477 ] 2478 ) 2479 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2480 [x for x in model.schema_pointer] if model.schema_pointer else [] 2481 ) 2482 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2483 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2484 [x for x in model.type_pointer] if model.type_pointer else None 2485 ) 2486 2487 return SchemaTypeIdentifier( 2488 schema_pointer=model_schema_pointer, 2489 key_pointer=model_key_pointer, 2490 type_pointer=model_type_pointer, 2491 types_mapping=types_mapping, 2492 parameters=model.parameters or {}, 2493 ) 2494 2495 def create_dynamic_schema_loader( 2496 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2497 ) -> DynamicSchemaLoader: 2498 schema_transformations = [] 2499 if model.schema_transformations: 2500 for transformation_model in model.schema_transformations: 2501 schema_transformations.append( 2502 self._create_component_from_model(model=transformation_model, config=config) 2503 ) 2504 name = "dynamic_properties" 2505 retriever = self._create_component_from_model( 2506 model=model.retriever, 2507 config=config, 2508 name=name, 2509 primary_key=None, 2510 partition_router=self._build_stream_slicer_from_partition_router( 2511 model.retriever, config 2512 ), 2513 transformations=[], 2514 use_cache=True, 2515 
            log_formatter=(
                lambda response: format_http_message(
                    response,
                    f"Schema loader '{name}' request",
                    "Request performed in order to extract schema.",
                    name,
                    is_auxiliary=True,
                )
            ),
        )
        schema_type_identifier = self._create_component_from_model(
            model.schema_type_identifier, config=config, parameters=model.parameters or {}
        )
        schema_filter = (
            self._create_component_from_model(
                model.schema_filter, config=config, parameters=model.parameters or {}
            )
            if model.schema_filter is not None
            else None
        )

        return DynamicSchemaLoader(
            retriever=retriever,
            config=config,
            schema_transformations=schema_transformations,
            schema_filter=schema_filter,
            schema_type_identifier=schema_type_identifier,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder:
        return JsonDecoder(parameters={})

    def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder:
        return CompositeRawDecoder(
            parser=ModelToComponentFactory._get_parser(model, config),
            stream_response=not self._emit_connector_builder_messages,
        )

    def create_jsonl_decoder(
        self, model: JsonlDecoderModel, config: Config, **kwargs: Any
    ) -> Decoder:
        return CompositeRawDecoder(
            parser=ModelToComponentFactory._get_parser(model, config),
            stream_response=not self._emit_connector_builder_messages,
        )

    def create_gzip_decoder(
        self, model: GzipDecoderModel, config: Config, **kwargs: Any
    ) -> Decoder:
        _compressed_response_types = {
            "gzip",
            "x-gzip",
            "gzip, deflate",
            "x-gzip, deflate",
            "application/zip",
            "application/gzip",
            "application/x-gzip",
            "application/x-zip-compressed",
        }

        gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

        if self._emit_connector_builder_messages:
            # This is surprising, but if the response is not streamed, CompositeRawDecoder calls
            # response.content, and the requests library decompresses the data, whereas response.raw
            # uses urllib3 directly and does not decompress it.
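            # Illustrative note (behavior of the requests library, not CDK-specific), sketched with a
            # hypothetical response:
            #
            #   # response.headers == {"Content-Encoding": "gzip"}
            #   # response.content  -> decompressed bytes (requests inflates them)
            #   # response.raw      -> still-compressed bytes (urllib3, untouched)
            #
            # which is why builder mode falls back to the inner parser below instead of wrapping it
            # in the GzipParser.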
            return CompositeRawDecoder(gzip_parser.inner_parser, False)

        return CompositeRawDecoder.by_headers(
            [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)],
            stream_response=True,
            fallback_parser=gzip_parser.inner_parser,
        )

    @staticmethod
    def create_iterable_decoder(
        model: IterableDecoderModel, config: Config, **kwargs: Any
    ) -> IterableDecoder:
        return IterableDecoder(parameters={})

    @staticmethod
    def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
        return XmlDecoder(parameters={})

    def create_zipfile_decoder(
        self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
    ) -> ZipfileDecoder:
        return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config))

    @staticmethod
    def _get_parser(model: BaseModel, config: Config) -> Parser:
        if isinstance(model, JsonDecoderModel):
            # Note that the logic differs slightly from the JsonDecoder: some legacy behavior of returning {} on error cases is maintained
            return JsonParser()
        elif isinstance(model, JsonlDecoderModel):
            return JsonLineParser()
        elif isinstance(model, CsvDecoderModel):
            return CsvParser(
                encoding=model.encoding,
                delimiter=model.delimiter,
                set_values_to_none=model.set_values_to_none,
            )
        elif isinstance(model, GzipDecoderModel):
            return GzipParser(
                inner_parser=ModelToComponentFactory._get_parser(model.decoder, config)
            )
        elif isinstance(
            model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel)
        ):
            raise ValueError(f"Decoder type {model} does not have a parser associated with it")

        raise ValueError(f"Unknown decoder type {model}")

    @staticmethod
    def create_json_file_schema_loader(
        model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
    ) -> JsonFileSchemaLoader:
        return JsonFileSchemaLoader(
            file_path=model.file_path or "", config=config, parameters=model.parameters or {}
        )

    def create_jwt_authenticator(
        self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
    ) -> JwtAuthenticator:
        jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
        jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
        )
        return JwtAuthenticator(
            config=config,
            parameters=model.parameters or {},
            algorithm=JwtAlgorithm(model.algorithm.value),
            secret_key=model.secret_key,
            base64_encode_secret_key=model.base64_encode_secret_key,
            token_duration=model.token_duration,
            header_prefix=model.header_prefix,
            kid=jwt_headers.kid,
            typ=jwt_headers.typ,
            cty=jwt_headers.cty,
            iss=jwt_payload.iss,
            sub=jwt_payload.sub,
            aud=jwt_payload.aud,
            additional_jwt_headers=model.additional_jwt_headers,
            additional_jwt_payload=model.additional_jwt_payload,
            passphrase=model.passphrase,
            request_option=request_option,
        )

    def create_list_partition_router(
        self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
    ) -> ListPartitionRouter:
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
2675 ) 2676 return ListPartitionRouter( 2677 cursor_field=model.cursor_field, 2678 request_option=request_option, 2679 values=model.values, 2680 config=config, 2681 parameters=model.parameters or {}, 2682 ) 2683 2684 @staticmethod 2685 def create_min_max_datetime( 2686 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2687 ) -> MinMaxDatetime: 2688 return MinMaxDatetime( 2689 datetime=model.datetime, 2690 datetime_format=model.datetime_format or "", 2691 max_datetime=model.max_datetime or "", 2692 min_datetime=model.min_datetime or "", 2693 parameters=model.parameters or {}, 2694 ) 2695 2696 @staticmethod 2697 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2698 return NoAuth(parameters=model.parameters or {}) 2699 2700 @staticmethod 2701 def create_no_pagination( 2702 model: NoPaginationModel, config: Config, **kwargs: Any 2703 ) -> NoPagination: 2704 return NoPagination(parameters={}) 2705 2706 def create_oauth_authenticator( 2707 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2708 ) -> DeclarativeOauth2Authenticator: 2709 profile_assertion = ( 2710 self._create_component_from_model(model.profile_assertion, config=config) 2711 if model.profile_assertion 2712 else None 2713 ) 2714 2715 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = ( 2716 self._get_refresh_token_error_information(model) 2717 ) 2718 if model.refresh_token_updater: 2719 # ignore type error because fixing it would have a lot of dependencies, revisit later 2720 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2721 config, 2722 InterpolatedString.create( 2723 model.token_refresh_endpoint, # type: ignore 2724 parameters=model.parameters or {}, 2725 ).eval(config), 2726 access_token_name=InterpolatedString.create( 2727 model.access_token_name or "access_token", parameters=model.parameters or {} 2728 ).eval(config), 2729 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2730 expires_in_name=InterpolatedString.create( 2731 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2732 ).eval(config), 2733 client_id_name=InterpolatedString.create( 2734 model.client_id_name or "client_id", parameters=model.parameters or {} 2735 ).eval(config), 2736 client_id=InterpolatedString.create( 2737 model.client_id, parameters=model.parameters or {} 2738 ).eval(config) 2739 if model.client_id 2740 else model.client_id, 2741 client_secret_name=InterpolatedString.create( 2742 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2743 ).eval(config), 2744 client_secret=InterpolatedString.create( 2745 model.client_secret, parameters=model.parameters or {} 2746 ).eval(config) 2747 if model.client_secret 2748 else model.client_secret, 2749 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2750 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2751 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2752 grant_type_name=InterpolatedString.create( 2753 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2754 ).eval(config), 2755 grant_type=InterpolatedString.create( 2756 model.grant_type or "refresh_token", parameters=model.parameters or {} 2757 ).eval(config), 2758 refresh_request_body=InterpolatedMapping( 2759 model.refresh_request_body or {}, parameters=model.parameters or {} 2760 ).eval(config), 2761 refresh_request_headers=InterpolatedMapping( 
2762 model.refresh_request_headers or {}, parameters=model.parameters or {} 2763 ).eval(config), 2764 scopes=model.scopes, 2765 token_expiry_date_format=model.token_expiry_date_format, 2766 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2767 message_repository=self._message_repository, 2768 refresh_token_error_status_codes=refresh_token_error_status_codes, 2769 refresh_token_error_key=refresh_token_error_key, 2770 refresh_token_error_values=refresh_token_error_values, 2771 ) 2772 # ignore type error because fixing it would have a lot of dependencies, revisit later 2773 return DeclarativeOauth2Authenticator( # type: ignore 2774 access_token_name=model.access_token_name or "access_token", 2775 access_token_value=model.access_token_value, 2776 client_id_name=model.client_id_name or "client_id", 2777 client_id=model.client_id, 2778 client_secret_name=model.client_secret_name or "client_secret", 2779 client_secret=model.client_secret, 2780 expires_in_name=model.expires_in_name or "expires_in", 2781 grant_type_name=model.grant_type_name or "grant_type", 2782 grant_type=model.grant_type or "refresh_token", 2783 refresh_request_body=model.refresh_request_body, 2784 refresh_request_headers=model.refresh_request_headers, 2785 refresh_token_name=model.refresh_token_name or "refresh_token", 2786 refresh_token=model.refresh_token, 2787 scopes=model.scopes, 2788 token_expiry_date=model.token_expiry_date, 2789 token_expiry_date_format=model.token_expiry_date_format, 2790 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2791 token_refresh_endpoint=model.token_refresh_endpoint, 2792 config=config, 2793 parameters=model.parameters or {}, 2794 message_repository=self._message_repository, 2795 profile_assertion=profile_assertion, 2796 use_profile_assertion=model.use_profile_assertion, 2797 refresh_token_error_status_codes=refresh_token_error_status_codes, 2798 refresh_token_error_key=refresh_token_error_key, 2799 refresh_token_error_values=refresh_token_error_values, 2800 ) 2801 2802 @staticmethod 2803 def _get_refresh_token_error_information( 2804 model: OAuthAuthenticatorModel, 2805 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2806 """ 2807 In a previous version of the CDK, the auth error was raised as a config_error only if a refresh token updater was 2808 defined. As a transition, we added those fields to the OAuthAuthenticatorModel. This method ensures that the 2809 information is defined only once and returns the right fields (a standalone sketch of this precedence appears a bit further below).
2810 """ 2811 refresh_token_updater = model.refresh_token_updater 2812 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2813 refresh_token_updater.refresh_token_error_status_codes 2814 or refresh_token_updater.refresh_token_error_key 2815 or refresh_token_updater.refresh_token_error_values 2816 ) 2817 is_defined_on_oauth_authenticator = ( 2818 model.refresh_token_error_status_codes 2819 or model.refresh_token_error_key 2820 or model.refresh_token_error_values 2821 ) 2822 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2823 raise ValueError( 2824 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2825 ) 2826 2827 if is_defined_on_refresh_token_updated: 2828 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2829 return ( 2830 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2831 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2832 else (), 2833 not_optional_refresh_token_updater.refresh_token_error_key or "", 2834 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2835 if not_optional_refresh_token_updater.refresh_token_error_values 2836 else (), 2837 ) 2838 elif is_defined_on_oauth_authenticator: 2839 return ( 2840 tuple(model.refresh_token_error_status_codes) 2841 if model.refresh_token_error_status_codes 2842 else (), 2843 model.refresh_token_error_key or "", 2844 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2845 ) 2846 2847 # returning default values we think cover most cases 2848 return (400,), "error", ("invalid_grant", "invalid_permissions") 2849 2850 def create_offset_increment( 2851 self, 2852 model: OffsetIncrementModel, 2853 config: Config, 2854 decoder: Decoder, 2855 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2856 **kwargs: Any, 2857 ) -> OffsetIncrement: 2858 if isinstance(decoder, PaginationDecoderDecorator): 2859 inner_decoder = decoder.decoder 2860 else: 2861 inner_decoder = decoder 2862 decoder = PaginationDecoderDecorator(decoder=decoder) 2863 2864 if self._is_supported_decoder_for_pagination(inner_decoder): 2865 decoder_to_use = decoder 2866 else: 2867 raise ValueError( 2868 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2869 ) 2870 2871 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2872 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2873 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2874 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2875 # When we have more time to investigate we can look into reusing the same component. 
2876 extractor = ( 2877 self._create_component_from_model( 2878 model=extractor_model, config=config, decoder=decoder_to_use 2879 ) 2880 if extractor_model 2881 else None 2882 ) 2883 2884 return OffsetIncrement( 2885 page_size=model.page_size, 2886 config=config, 2887 decoder=decoder_to_use, 2888 extractor=extractor, 2889 inject_on_first_request=model.inject_on_first_request or False, 2890 parameters=model.parameters or {}, 2891 ) 2892 2893 @staticmethod 2894 def create_page_increment( 2895 model: PageIncrementModel, config: Config, **kwargs: Any 2896 ) -> PageIncrement: 2897 return PageIncrement( 2898 page_size=model.page_size, 2899 config=config, 2900 start_from_page=model.start_from_page or 0, 2901 inject_on_first_request=model.inject_on_first_request or False, 2902 parameters=model.parameters or {}, 2903 ) 2904 2905 def create_parent_stream_config( 2906 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2907 ) -> ParentStreamConfig: 2908 declarative_stream = self._create_component_from_model( 2909 model.stream, 2910 config=config, 2911 is_parent=True, 2912 **kwargs, 2913 ) 2914 request_option = ( 2915 self._create_component_from_model(model.request_option, config=config) 2916 if model.request_option 2917 else None 2918 ) 2919 2920 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2921 raise ValueError( 2922 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2923 ) 2924 2925 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2926 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2927 ) 2928 2929 return ParentStreamConfig( 2930 parent_key=model.parent_key, 2931 request_option=request_option, 2932 stream=declarative_stream, 2933 partition_field=model.partition_field, 2934 config=config, 2935 incremental_dependency=model.incremental_dependency or False, 2936 parameters=model.parameters or {}, 2937 extra_fields=model.extra_fields, 2938 lazy_read_pointer=model_lazy_read_pointer, 2939 ) 2940 2941 def create_properties_from_endpoint( 2942 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2943 ) -> PropertiesFromEndpoint: 2944 retriever = self._create_component_from_model( 2945 model=model.retriever, 2946 config=config, 2947 name="dynamic_properties", 2948 primary_key=None, 2949 stream_slicer=None, 2950 transformations=[], 2951 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to be different 2952 ) 2953 return PropertiesFromEndpoint( 2954 property_field_path=model.property_field_path, 2955 retriever=retriever, 2956 config=config, 2957 parameters=model.parameters or {}, 2958 ) 2959 2960 def create_property_chunking( 2961 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2962 ) -> PropertyChunking: 2963 record_merge_strategy = ( 2964 self._create_component_from_model( 2965 model=model.record_merge_strategy, config=config, **kwargs 2966 ) 2967 if model.record_merge_strategy 2968 else None 2969 ) 2970 2971 property_limit_type: PropertyLimitType 2972 match model.property_limit_type: 2973 case PropertyLimitTypeModel.property_count: 2974 property_limit_type = PropertyLimitType.property_count 2975 case PropertyLimitTypeModel.characters: 2976 property_limit_type = PropertyLimitType.characters 2977 case _: 2978 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")
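# --- Illustrative sketch (not part of the factory source) --------------------
# The `match` statement above maps the declarative enum (PropertyLimitTypeModel)
# onto the runtime enum (PropertyLimitType) and rejects anything unexpected.
# The self-contained example below reproduces that pattern with two hypothetical
# enums so the mapping and the error path can be read in isolation.
import enum

class _ModelLimitType(enum.Enum):  # stand-in for PropertyLimitTypeModel
    property_count = "property_count"
    characters = "characters"

class _RuntimeLimitType(enum.Enum):  # stand-in for PropertyLimitType
    property_count = "property_count"
    characters = "characters"

def _to_runtime_limit_type(value: _ModelLimitType) -> _RuntimeLimitType:
    match value:
        case _ModelLimitType.property_count:
            return _RuntimeLimitType.property_count
        case _ModelLimitType.characters:
            return _RuntimeLimitType.characters
        case _:
            raise ValueError(f"Invalid PropertyLimitType {value}")

# _to_runtime_limit_type(_ModelLimitType.characters) is _RuntimeLimitType.characters
# ------------------------------------------------------------------------------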
2979 2980 return PropertyChunking( 2981 property_limit_type=property_limit_type, 2982 property_limit=model.property_limit, 2983 record_merge_strategy=record_merge_strategy, 2984 config=config, 2985 parameters=model.parameters or {}, 2986 ) 2987 2988 def create_query_properties( 2989 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 2990 ) -> QueryProperties: 2991 if isinstance(model.property_list, list): 2992 property_list = model.property_list 2993 else: 2994 property_list = self._create_component_from_model( 2995 model=model.property_list, config=config, **kwargs 2996 ) 2997 2998 property_chunking = ( 2999 self._create_component_from_model( 3000 model=model.property_chunking, config=config, **kwargs 3001 ) 3002 if model.property_chunking 3003 else None 3004 ) 3005 3006 property_selector = ( 3007 self._create_component_from_model( 3008 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3009 ) 3010 if model.property_selector 3011 else None 3012 ) 3013 3014 return QueryProperties( 3015 property_list=property_list, 3016 always_include_properties=model.always_include_properties, 3017 property_chunking=property_chunking, 3018 property_selector=property_selector, 3019 config=config, 3020 parameters=model.parameters or {}, 3021 ) 3022 3023 def create_json_schema_property_selector( 3024 self, 3025 model: JsonSchemaPropertySelectorModel, 3026 config: Config, 3027 *, 3028 stream_name: str, 3029 **kwargs: Any, 3030 ) -> JsonSchemaPropertySelector: 3031 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3032 3033 transformations = [] 3034 if model.transformations: 3035 for transformation_model in model.transformations: 3036 transformations.append( 3037 self._create_component_from_model(model=transformation_model, config=config) 3038 ) 3039 3040 return JsonSchemaPropertySelector( 3041 configured_stream=configured_stream, 3042 properties_transformations=transformations, 3043 config=config, 3044 parameters=model.parameters or {}, 3045 ) 3046 3047 @staticmethod 3048 def create_record_filter( 3049 model: RecordFilterModel, config: Config, **kwargs: Any 3050 ) -> RecordFilter: 3051 return RecordFilter( 3052 condition=model.condition or "", config=config, parameters=model.parameters or {} 3053 ) 3054 3055 @staticmethod 3056 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3057 return RequestPath(parameters={}) 3058 3059 @staticmethod 3060 def create_request_option( 3061 model: RequestOptionModel, config: Config, **kwargs: Any 3062 ) -> RequestOption: 3063 inject_into = RequestOptionType(model.inject_into.value) 3064 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3065 [ 3066 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3067 for segment in model.field_path 3068 ] 3069 if model.field_path 3070 else None 3071 ) 3072 field_name = ( 3073 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3074 if model.field_name 3075 else None 3076 ) 3077 return RequestOption( 3078 field_name=field_name, 3079 field_path=field_path, 3080 inject_into=inject_into, 3081 parameters=kwargs.get("parameters", {}), 3082 ) 3083 3084 def create_record_selector( 3085 self, 3086 model: RecordSelectorModel, 3087 config: Config, 3088 *, 3089 name: str, 3090 transformations: List[RecordTransformation] | None = None, 3091 decoder: Decoder | None = None, 3092 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3093 
file_uploader: Optional[DefaultFileUploader] = None, 3094 **kwargs: Any, 3095 ) -> RecordSelector: 3096 extractor = self._create_component_from_model( 3097 model=model.extractor, decoder=decoder, config=config 3098 ) 3099 record_filter = ( 3100 self._create_component_from_model(model.record_filter, config=config) 3101 if model.record_filter 3102 else None 3103 ) 3104 3105 transform_before_filtering = ( 3106 False if model.transform_before_filtering is None else model.transform_before_filtering 3107 ) 3108 if client_side_incremental_sync_cursor: 3109 record_filter = ClientSideIncrementalRecordFilterDecorator( 3110 config=config, 3111 parameters=model.parameters, 3112 condition=model.record_filter.condition 3113 if (model.record_filter and hasattr(model.record_filter, "condition")) 3114 else None, 3115 cursor=client_side_incremental_sync_cursor, 3116 ) 3117 transform_before_filtering = ( 3118 True 3119 if model.transform_before_filtering is None 3120 else model.transform_before_filtering 3121 ) 3122 3123 if model.schema_normalization is None: 3124 # default to no schema normalization if not set 3125 model.schema_normalization = SchemaNormalizationModel.None_ 3126 3127 schema_normalization = ( 3128 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3129 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3130 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3131 ) 3132 3133 return RecordSelector( 3134 extractor=extractor, 3135 name=name, 3136 config=config, 3137 record_filter=record_filter, 3138 transformations=transformations or [], 3139 file_uploader=file_uploader, 3140 schema_normalization=schema_normalization, 3141 parameters=model.parameters or {}, 3142 transform_before_filtering=transform_before_filtering, 3143 ) 3144 3145 @staticmethod 3146 def create_remove_fields( 3147 model: RemoveFieldsModel, config: Config, **kwargs: Any 3148 ) -> RemoveFields: 3149 return RemoveFields( 3150 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3151 ) 3152 3153 def create_selective_authenticator( 3154 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3155 ) -> DeclarativeAuthenticator: 3156 authenticators = { 3157 name: self._create_component_from_model(model=auth, config=config) 3158 for name, auth in model.authenticators.items() 3159 } 3160 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3161 return SelectiveAuthenticator( # type: ignore[abstract] 3162 config=config, 3163 authenticators=authenticators, 3164 authenticator_selection_path=model.authenticator_selection_path, 3165 **kwargs, 3166 ) 3167 3168 @staticmethod 3169 def create_legacy_session_token_authenticator( 3170 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3171 ) -> LegacySessionTokenAuthenticator: 3172 return LegacySessionTokenAuthenticator( 3173 api_url=url_base, 3174 header=model.header, 3175 login_url=model.login_url, 3176 password=model.password or "", 3177 session_token=model.session_token or "", 3178 session_token_response_key=model.session_token_response_key or "", 3179 username=model.username or "", 3180 validate_session_url=model.validate_session_url, 3181 config=config, 3182 parameters=model.parameters or {}, 3183 ) 3184 3185 def create_simple_retriever( 3186 self, 3187 model: SimpleRetrieverModel, 3188 config: Config, 3189 *, 3190 
name: str, 3191 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3192 request_options_provider: Optional[RequestOptionsProvider] = None, 3193 cursor: Optional[Cursor] = None, 3194 has_stop_condition_cursor: bool = False, 3195 is_client_side_incremental_sync: bool = False, 3196 transformations: List[RecordTransformation], 3197 file_uploader: Optional[DefaultFileUploader] = None, 3198 incremental_sync: Optional[ 3199 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3200 ] = None, 3201 use_cache: Optional[bool] = None, 3202 log_formatter: Optional[Callable[[Response], Any]] = None, 3203 partition_router: Optional[PartitionRouter] = None, 3204 **kwargs: Any, 3205 ) -> SimpleRetriever: 3206 def _get_url(req: Requester) -> str: 3207 """ 3208 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3209 This is needed because the URL is not set until the requester is created. 3210 """ 3211 3212 _url: str = ( 3213 model.requester.url 3214 if hasattr(model.requester, "url") and model.requester.url is not None 3215 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3216 ) 3217 _url_base: str = ( 3218 model.requester.url_base 3219 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3220 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3221 ) 3222 3223 return _url or _url_base 3224 3225 if cursor is None: 3226 cursor = FinalStateCursor(name, None, self._message_repository) 3227 3228 decoder = ( 3229 self._create_component_from_model(model=model.decoder, config=config) 3230 if model.decoder 3231 else JsonDecoder(parameters={}) 3232 ) 3233 record_selector = self._create_component_from_model( 3234 model=model.record_selector, 3235 name=name, 3236 config=config, 3237 decoder=decoder, 3238 transformations=transformations, 3239 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3240 file_uploader=file_uploader, 3241 ) 3242 3243 query_properties: Optional[QueryProperties] = None 3244 query_properties_key: Optional[str] = None 3245 self._ensure_query_properties_to_model(model.requester) 3246 if self._has_query_properties_in_request_parameters(model.requester): 3247 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3248 # places instead of default to request_parameters which isn't clearly documented 3249 if ( 3250 hasattr(model.requester, "fetch_properties_from_endpoint") 3251 and model.requester.fetch_properties_from_endpoint 3252 ): 3253 raise ValueError( 3254 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3255 ) 3256 3257 query_properties_definitions = [] 3258 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3259 if isinstance(request_parameter, QueryPropertiesModel): 3260 query_properties_key = key 3261 query_properties_definitions.append(request_parameter) 3262 3263 if len(query_properties_definitions) > 1: 3264 raise ValueError( 3265 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3266 ) 3267 3268 if len(query_properties_definitions) == 1: 3269 query_properties = self._create_component_from_model( 3270 
model=query_properties_definitions[0], stream_name=name, config=config 3271 ) 3272 3273 # Removes QueryProperties components from the interpolated mappings because it has been designed 3274 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 3275 # instead of through jinja interpolation 3276 if hasattr(model.requester, "request_parameters") and isinstance( 3277 model.requester.request_parameters, Mapping 3278 ): 3279 model.requester.request_parameters = self._remove_query_properties( 3280 model.requester.request_parameters 3281 ) 3282 elif ( 3283 hasattr(model.requester, "fetch_properties_from_endpoint") 3284 and model.requester.fetch_properties_from_endpoint 3285 ): 3286 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3287 query_properties_definition = QueryPropertiesModel( 3288 type="QueryProperties", 3289 property_list=model.requester.fetch_properties_from_endpoint, 3290 always_include_properties=None, 3291 property_chunking=None, 3292 ) # type: ignore # $parameters has a default value 3293 3294 query_properties = self.create_query_properties( 3295 model=query_properties_definition, 3296 stream_name=name, 3297 config=config, 3298 ) 3299 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3300 query_properties = self.create_query_properties( 3301 model=model.requester.query_properties, 3302 stream_name=name, 3303 config=config, 3304 ) 3305 3306 requester = self._create_component_from_model( 3307 model=model.requester, 3308 decoder=decoder, 3309 name=name, 3310 query_properties_key=query_properties_key, 3311 use_cache=use_cache, 3312 config=config, 3313 ) 3314 3315 if not request_options_provider: 3316 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3317 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3318 partition_router, PartitionRouter 3319 ): 3320 request_options_provider = partition_router 3321 3322 paginator = ( 3323 self._create_component_from_model( 3324 model=model.paginator, 3325 config=config, 3326 url_base=_get_url(requester), 3327 extractor_model=model.record_selector.extractor, 3328 decoder=decoder, 3329 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3330 ) 3331 if model.paginator 3332 else NoPagination(parameters={}) 3333 ) 3334 3335 ignore_stream_slicer_parameters_on_paginated_requests = ( 3336 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3337 ) 3338 3339 if ( 3340 model.partition_router 3341 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3342 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3343 and any( 3344 parent_stream_config.lazy_read_pointer 3345 for parent_stream_config in model.partition_router.parent_stream_configs 3346 ) 3347 ): 3348 if incremental_sync: 3349 if incremental_sync.type != "DatetimeBasedCursor": 3350 raise ValueError( 3351 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3352 ) 3353 3354 elif incremental_sync.step or incremental_sync.cursor_granularity: 3355 raise ValueError( 3356 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3357 ) 3358 3359 if model.decoder and model.decoder.type != "JsonDecoder": 3360 raise ValueError( 3361 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3362 ) 3363 3364 return LazySimpleRetriever( 3365 name=name, 3366 paginator=paginator, 3367 primary_key=primary_key, 3368 requester=requester, 3369 record_selector=record_selector, 3370 stream_slicer=_NO_STREAM_SLICING, 3371 request_option_provider=request_options_provider, 3372 config=config, 3373 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3374 parameters=model.parameters or {}, 3375 ) 3376 3377 if ( 3378 model.record_selector.record_filter 3379 and model.pagination_reset 3380 and model.pagination_reset.limits 3381 ): 3382 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3383 3384 return SimpleRetriever( 3385 name=name, 3386 paginator=paginator, 3387 primary_key=primary_key, 3388 requester=requester, 3389 record_selector=record_selector, 3390 stream_slicer=_NO_STREAM_SLICING, 3391 request_option_provider=request_options_provider, 3392 config=config, 3393 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3394 additional_query_properties=query_properties, 3395 log_formatter=self._get_log_formatter(log_formatter, name), 3396 pagination_tracker_factory=self._create_pagination_tracker_factory( 3397 model.pagination_reset, cursor 3398 ), 3399 parameters=model.parameters or {}, 3400 ) 3401 3402 def _create_pagination_tracker_factory( 3403 self, model: Optional[PaginationResetModel], cursor: Cursor 3404 ) -> Callable[[], PaginationTracker]: 3405 if model is None: 3406 return lambda: PaginationTracker() 3407 3408 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3409 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3410 if model.action == PaginationResetActionModel.RESET: 3411 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3412 pass 3413 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3414 if isinstance(cursor, ConcurrentCursor): 3415 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3416 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3417 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3418 {}, datetime.timedelta(0) 3419 ) 3420 elif not isinstance(cursor, FinalStateCursor): 3421 LOGGER.warning( 3422 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3423 ) 3424 else: 3425 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3426 3427 limit = model.limits.number_of_records if model and model.limits else None 3428 return lambda: PaginationTracker(cursor_factory(), limit) 3429 3430 def _get_log_formatter( 3431 self, log_formatter: Callable[[Response], Any] | None, name: str 3432 ) -> Callable[[Response], Any] | None: 3433 if self._should_limit_slices_fetched(): 3434 return ( 3435 ( 3436 lambda response: format_http_message( 3437 response, 3438 f"Stream '{name}' request", 3439 f"Request performed in order to extract records for stream '{name}'", 3440 name, 3441 ) 3442 ) 3443 if not log_formatter 3444 else log_formatter 3445 ) 3446 return None 3447 3448 def _should_limit_slices_fetched(self) -> bool: 3449 """ 3450 Returns True if the number of slices fetched should be limited, False otherwise. 
3451 This is used to limit the number of slices fetched during tests. 3452 """ 3453 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3454 3455 @staticmethod 3456 def _has_query_properties_in_request_parameters( 3457 requester: Union[HttpRequesterModel, CustomRequesterModel], 3458 ) -> bool: 3459 if not hasattr(requester, "request_parameters"): 3460 return False 3461 request_parameters = requester.request_parameters 3462 if request_parameters and isinstance(request_parameters, Mapping): 3463 for request_parameter in request_parameters.values(): 3464 if isinstance(request_parameter, QueryPropertiesModel): 3465 return True 3466 return False 3467 3468 @staticmethod 3469 def _remove_query_properties( 3470 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3471 ) -> Mapping[str, str]: 3472 return { 3473 parameter_field: request_parameter 3474 for parameter_field, request_parameter in request_parameters.items() 3475 if not isinstance(request_parameter, QueryPropertiesModel) 3476 } 3477 3478 def create_state_delegating_stream( 3479 self, 3480 model: StateDelegatingStreamModel, 3481 config: Config, 3482 has_parent_state: Optional[bool] = None, 3483 **kwargs: Any, 3484 ) -> DefaultStream: 3485 if ( 3486 model.full_refresh_stream.name != model.name 3487 or model.name != model.incremental_stream.name 3488 ): 3489 raise ValueError( 3490 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3491 ) 3492 3493 stream_model = self._get_state_delegating_stream_model( 3494 False if has_parent_state is None else has_parent_state, model 3495 ) 3496 3497 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel 3498 3499 def _get_state_delegating_stream_model( 3500 self, has_parent_state: bool, model: StateDelegatingStreamModel 3501 ) -> DeclarativeStreamModel: 3502 return ( 3503 model.incremental_stream 3504 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3505 else model.full_refresh_stream 3506 ) 3507 3508 def _create_async_job_status_mapping( 3509 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3510 ) -> Mapping[str, AsyncJobStatus]: 3511 api_status_to_cdk_status = {} 3512 for cdk_status, api_statuses in model.dict().items(): 3513 if cdk_status == "type": 3514 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3515 continue 3516 3517 for status in api_statuses: 3518 if status in api_status_to_cdk_status: 3519 raise ValueError( 3520 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3521 ) 3522 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3523 return api_status_to_cdk_status 3524 3525 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3526 match status: 3527 case "running": 3528 return AsyncJobStatus.RUNNING 3529 case "completed": 3530 return AsyncJobStatus.COMPLETED 3531 case "failed": 3532 return AsyncJobStatus.FAILED 3533 case "timeout": 3534 return AsyncJobStatus.TIMED_OUT 3535 case _: 3536 raise ValueError(f"Unsupported CDK status {status}") 3537 3538 def create_async_retriever( 3539 self, 3540 model: AsyncRetrieverModel, 3541 config: Config, 3542 *, 3543 name: str, 3544 primary_key: Optional[ 3545 Union[str, List[str], List[List[str]]] 3546 ], # this seems to be needed to match create_simple_retriever 3547 stream_slicer: Optional[StreamSlicer], 3548 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3549 transformations: List[RecordTransformation], 3550 **kwargs: Any, 3551 ) -> AsyncRetriever: 3552 if model.download_target_requester and not model.download_target_extractor: 3553 raise ValueError( 3554 f"`download_target_extractor` required if using a `download_target_requester`" 3555 ) 3556 3557 def _get_download_retriever( 3558 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3559 ) -> SimpleRetriever: 3560 # We create a record selector for the download retriever 3561 # with no schema normalization and no transformations, neither record filter 3562 # as all this occurs in the record_selector of the AsyncRetriever 3563 record_selector = RecordSelector( 3564 extractor=extractor, 3565 name=name, 3566 record_filter=None, 3567 transformations=[], 3568 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3569 config=config, 3570 parameters={}, 3571 ) 3572 paginator = ( 3573 self._create_component_from_model( 3574 model=model.download_paginator, 3575 decoder=_decoder, 3576 config=config, 3577 url_base="", 3578 ) 3579 if model.download_paginator 3580 else NoPagination(parameters={}) 3581 ) 3582 3583 return SimpleRetriever( 3584 requester=requester, 3585 record_selector=record_selector, 3586 primary_key=None, 3587 name=name, 3588 paginator=paginator, 3589 config=config, 3590 parameters={}, 3591 log_formatter=self._get_log_formatter(None, name), 3592 ) 3593 3594 def _get_job_timeout() -> datetime.timedelta: 3595 user_defined_timeout: Optional[int] = ( 3596 int( 3597 InterpolatedString.create( 3598 str(model.polling_job_timeout), 3599 parameters={}, 3600 ).eval(config) 3601 ) 3602 if model.polling_job_timeout 3603 else None 3604 ) 3605 3606 # check for user defined timeout during the test read or 15 minutes 3607 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3608 # default value for non-connector builder is 60 minutes. 
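# --- Illustrative sketch (not part of the factory source) --------------------
# `_get_job_timeout` resolves the async-job timeout as described in the comments
# around this point: a user-defined `polling_job_timeout` (in minutes) always
# wins; otherwise the default is 15 minutes for connector-builder test reads and
# 60 minutes for a regular sync. The helper below mirrors that resolution; its
# name and parameters are hypothetical.
import datetime
from typing import Optional

def _resolve_job_timeout(
    user_defined_minutes: Optional[int], is_test_read: bool
) -> datetime.timedelta:
    if is_test_read:
        return datetime.timedelta(minutes=user_defined_minutes or 15)
    return datetime.timedelta(minutes=user_defined_minutes or 60)

# _resolve_job_timeout(None, is_test_read=True)   -> 15 minutes
# _resolve_job_timeout(45, is_test_read=False)    -> 45 minutes
# ------------------------------------------------------------------------------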
3609 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3610 3611 return ( 3612 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3613 ) 3614 3615 decoder = ( 3616 self._create_component_from_model(model=model.decoder, config=config) 3617 if model.decoder 3618 else JsonDecoder(parameters={}) 3619 ) 3620 record_selector = self._create_component_from_model( 3621 model=model.record_selector, 3622 config=config, 3623 decoder=decoder, 3624 name=name, 3625 transformations=transformations, 3626 client_side_incremental_sync=client_side_incremental_sync, 3627 ) 3628 3629 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3630 if self._should_limit_slices_fetched(): 3631 stream_slicer = cast( 3632 StreamSlicer, 3633 StreamSlicerTestReadDecorator( 3634 wrapped_slicer=stream_slicer, 3635 maximum_number_of_slices=self._limit_slices_fetched or 5, 3636 ), 3637 ) 3638 3639 creation_requester = self._create_component_from_model( 3640 model=model.creation_requester, 3641 decoder=decoder, 3642 config=config, 3643 name=f"job creation - {name}", 3644 ) 3645 polling_requester = self._create_component_from_model( 3646 model=model.polling_requester, 3647 decoder=decoder, 3648 config=config, 3649 name=f"job polling - {name}", 3650 ) 3651 job_download_components_name = f"job download - {name}" 3652 download_decoder = ( 3653 self._create_component_from_model(model=model.download_decoder, config=config) 3654 if model.download_decoder 3655 else JsonDecoder(parameters={}) 3656 ) 3657 download_extractor = ( 3658 self._create_component_from_model( 3659 model=model.download_extractor, 3660 config=config, 3661 decoder=download_decoder, 3662 parameters=model.parameters, 3663 ) 3664 if model.download_extractor 3665 else DpathExtractor( 3666 [], 3667 config=config, 3668 decoder=download_decoder, 3669 parameters=model.parameters or {}, 3670 ) 3671 ) 3672 download_requester = self._create_component_from_model( 3673 model=model.download_requester, 3674 decoder=download_decoder, 3675 config=config, 3676 name=job_download_components_name, 3677 ) 3678 download_retriever = _get_download_retriever( 3679 download_requester, download_extractor, download_decoder 3680 ) 3681 abort_requester = ( 3682 self._create_component_from_model( 3683 model=model.abort_requester, 3684 decoder=decoder, 3685 config=config, 3686 name=f"job abort - {name}", 3687 ) 3688 if model.abort_requester 3689 else None 3690 ) 3691 delete_requester = ( 3692 self._create_component_from_model( 3693 model=model.delete_requester, 3694 decoder=decoder, 3695 config=config, 3696 name=f"job delete - {name}", 3697 ) 3698 if model.delete_requester 3699 else None 3700 ) 3701 download_target_requester = ( 3702 self._create_component_from_model( 3703 model=model.download_target_requester, 3704 decoder=decoder, 3705 config=config, 3706 name=f"job extract_url - {name}", 3707 ) 3708 if model.download_target_requester 3709 else None 3710 ) 3711 status_extractor = self._create_component_from_model( 3712 model=model.status_extractor, decoder=decoder, config=config, name=name 3713 ) 3714 download_target_extractor = ( 3715 self._create_component_from_model( 3716 model=model.download_target_extractor, 3717 decoder=decoder, 3718 config=config, 3719 name=name, 3720 ) 3721 if model.download_target_extractor 3722 else None 3723 ) 3724 3725 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3726 creation_requester=creation_requester, 3727 polling_requester=polling_requester, 3728 
download_retriever=download_retriever, 3729 download_target_requester=download_target_requester, 3730 abort_requester=abort_requester, 3731 delete_requester=delete_requester, 3732 status_extractor=status_extractor, 3733 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3734 download_target_extractor=download_target_extractor, 3735 job_timeout=_get_job_timeout(), 3736 ) 3737 3738 async_job_partition_router = AsyncJobPartitionRouter( 3739 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3740 job_repository, 3741 stream_slices, 3742 self._job_tracker, 3743 self._message_repository, 3744 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3745 has_bulk_parent=False, 3746 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3747 # `None` == default retry is set to 3 attempts, under the hood. 3748 job_max_retry=1 if self._emit_connector_builder_messages else None, 3749 ), 3750 stream_slicer=stream_slicer, 3751 config=config, 3752 parameters=model.parameters or {}, 3753 ) 3754 3755 return AsyncRetriever( 3756 record_selector=record_selector, 3757 stream_slicer=async_job_partition_router, 3758 config=config, 3759 parameters=model.parameters or {}, 3760 ) 3761 3762 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3763 config_migrations = [ 3764 self._create_component_from_model(migration, config) 3765 for migration in ( 3766 model.config_normalization_rules.config_migrations 3767 if ( 3768 model.config_normalization_rules 3769 and model.config_normalization_rules.config_migrations 3770 ) 3771 else [] 3772 ) 3773 ] 3774 config_transformations = [ 3775 self._create_component_from_model(transformation, config) 3776 for transformation in ( 3777 model.config_normalization_rules.transformations 3778 if ( 3779 model.config_normalization_rules 3780 and model.config_normalization_rules.transformations 3781 ) 3782 else [] 3783 ) 3784 ] 3785 config_validations = [ 3786 self._create_component_from_model(validation, config) 3787 for validation in ( 3788 model.config_normalization_rules.validations 3789 if ( 3790 model.config_normalization_rules 3791 and model.config_normalization_rules.validations 3792 ) 3793 else [] 3794 ) 3795 ] 3796 3797 return Spec( 3798 connection_specification=model.connection_specification, 3799 documentation_url=model.documentation_url, 3800 advanced_auth=model.advanced_auth, 3801 parameters={}, 3802 config_migrations=config_migrations, 3803 config_transformations=config_transformations, 3804 config_validations=config_validations, 3805 ) 3806 3807 def create_substream_partition_router( 3808 self, 3809 model: SubstreamPartitionRouterModel, 3810 config: Config, 3811 *, 3812 stream_name: str, 3813 **kwargs: Any, 3814 ) -> SubstreamPartitionRouter: 3815 parent_stream_configs = [] 3816 if model.parent_stream_configs: 3817 parent_stream_configs.extend( 3818 [ 3819 self.create_parent_stream_config_with_substream_wrapper( 3820 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3821 ) 3822 for parent_stream_config in model.parent_stream_configs 3823 ] 3824 ) 3825 3826 return SubstreamPartitionRouter( 3827 parent_stream_configs=parent_stream_configs, 3828 parameters=model.parameters or {}, 3829 config=config, 3830 ) 3831 3832 def create_parent_stream_config_with_substream_wrapper( 3833 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3834 ) -> Any: 3835 # getting the parent state 
3836 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3837 3838 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3839 has_parent_state = bool( 3840 self._connector_state_manager.get_stream_state(stream_name, None) 3841 if model.incremental_dependency 3842 else False 3843 ) 3844 connector_state_manager = self._instantiate_parent_stream_state_manager( 3845 child_state, config, model, has_parent_state 3846 ) 3847 3848 substream_factory = ModelToComponentFactory( 3849 connector_state_manager=connector_state_manager, 3850 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3851 limit_slices_fetched=self._limit_slices_fetched, 3852 emit_connector_builder_messages=self._emit_connector_builder_messages, 3853 disable_retries=self._disable_retries, 3854 disable_cache=self._disable_cache, 3855 message_repository=StateFilteringMessageRepository( 3856 LogAppenderMessageRepositoryDecorator( 3857 { 3858 "airbyte_cdk": {"stream": {"is_substream": True}}, 3859 "http": {"is_auxiliary": True}, 3860 }, 3861 self._message_repository, 3862 self._evaluate_log_level(self._emit_connector_builder_messages), 3863 ), 3864 ), 3865 api_budget=self._api_budget, 3866 ) 3867 3868 return substream_factory.create_parent_stream_config( 3869 model=model, config=config, stream_name=stream_name, **kwargs 3870 ) 3871 3872 def _instantiate_parent_stream_state_manager( 3873 self, 3874 child_state: MutableMapping[str, Any], 3875 config: Config, 3876 model: ParentStreamConfigModel, 3877 has_parent_state: bool, 3878 ) -> ConnectorStateManager: 3879 """ 3880 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3881 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3882 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3883 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3884 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3885 incremental_dependency is set. 3886 """ 3887 if model.incremental_dependency and child_state: 3888 parent_stream_name = model.stream.name or "" 3889 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3890 child_state, parent_stream_name 3891 ) 3892 3893 if not parent_state: 3894 # there are two migration cases: state value from child stream or from global state 3895 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3896 child_state, parent_stream_name 3897 ) 3898 3899 if not parent_state and not isinstance(parent_state, dict): 3900 cursor_values = child_state.values() 3901 if cursor_values and len(cursor_values) == 1: 3902 # We assume the child state is a pair `{<cursor_field>: <cursor_value>}` and we will use the 3903 # cursor value as a parent state. 
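# --- Illustrative sketch (not part of the factory source) --------------------
# The migration described in the comment above takes a legacy child state of the
# form {<cursor_field>: <cursor_value>} and re-emits the cursor value as the
# parent stream's state. The snippet below builds such a parent state message
# with the same airbyte_cdk models used a few lines further down; the stream
# name and cursor values are made up for illustration only.
from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)

_child_state = {"updated_at": "2024-01-01T00:00:00Z"}  # hypothetical legacy child state
_cursor_field, _cursor_value = next(iter(_child_state.items()))
_parent_state_message = AirbyteStateMessage(
    type=AirbyteStateType.STREAM,
    stream=AirbyteStreamState(
        stream_descriptor=StreamDescriptor(name="parent_stream", namespace=None),
        stream_state=AirbyteStateBlob({_cursor_field: _cursor_value}),
    ),
)
# ------------------------------------------------------------------------------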
3904 incremental_sync_model: Union[ 3905 DatetimeBasedCursorModel, 3906 IncrementingCountCursorModel, 3907 ] = ( 3908 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3909 if isinstance(model.stream, DeclarativeStreamModel) 3910 else self._get_state_delegating_stream_model( 3911 has_parent_state, model.stream 3912 ).incremental_sync 3913 ) 3914 cursor_field = InterpolatedString.create( 3915 incremental_sync_model.cursor_field, 3916 parameters=incremental_sync_model.parameters or {}, 3917 ).eval(config) 3918 parent_state = AirbyteStateMessage( 3919 type=AirbyteStateType.STREAM, 3920 stream=AirbyteStreamState( 3921 stream_descriptor=StreamDescriptor( 3922 name=parent_stream_name, namespace=None 3923 ), 3924 stream_state=AirbyteStateBlob( 3925 {cursor_field: list(cursor_values)[0]} 3926 ), 3927 ), 3928 ) 3929 return ConnectorStateManager([parent_state] if parent_state else []) 3930 3931 return ConnectorStateManager([]) 3932 3933 @staticmethod 3934 def create_wait_time_from_header( 3935 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3936 ) -> WaitTimeFromHeaderBackoffStrategy: 3937 return WaitTimeFromHeaderBackoffStrategy( 3938 header=model.header, 3939 parameters=model.parameters or {}, 3940 config=config, 3941 regex=model.regex, 3942 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3943 if model.max_waiting_time_in_seconds is not None 3944 else None, 3945 ) 3946 3947 @staticmethod 3948 def create_wait_until_time_from_header( 3949 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3950 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3951 return WaitUntilTimeFromHeaderBackoffStrategy( 3952 header=model.header, 3953 parameters=model.parameters or {}, 3954 config=config, 3955 min_wait=model.min_wait, 3956 regex=model.regex, 3957 ) 3958 3959 def get_message_repository(self) -> MessageRepository: 3960 return self._message_repository 3961 3962 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3963 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3964 3965 @staticmethod 3966 def create_components_mapping_definition( 3967 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3968 ) -> ComponentMappingDefinition: 3969 interpolated_value = InterpolatedString.create( 3970 model.value, parameters=model.parameters or {} 3971 ) 3972 field_path = [ 3973 InterpolatedString.create(path, parameters=model.parameters or {}) 3974 for path in model.field_path 3975 ] 3976 return ComponentMappingDefinition( 3977 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3978 value=interpolated_value, 3979 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3980 create_or_update=model.create_or_update, 3981 condition=model.condition, 3982 parameters=model.parameters or {}, 3983 ) 3984 3985 def create_http_components_resolver( 3986 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3987 ) -> Any: 3988 retriever = self._create_component_from_model( 3989 model=model.retriever, 3990 config=config, 3991 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3992 primary_key=None, 3993 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3994 transformations=[], 3995 ) 3996 3997 components_mapping = [] 3998 for 
component_mapping_definition_model in model.components_mapping: 3999 if component_mapping_definition_model.condition: 4000 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4001 components_mapping.append( 4002 self._create_component_from_model( 4003 model=component_mapping_definition_model, 4004 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4005 component_mapping_definition_model.value_type 4006 ), 4007 config=config, 4008 ) 4009 ) 4010 4011 return HttpComponentsResolver( 4012 retriever=retriever, 4013 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4014 config=config, 4015 components_mapping=components_mapping, 4016 parameters=model.parameters or {}, 4017 ) 4018 4019 @staticmethod 4020 def create_stream_config( 4021 model: StreamConfigModel, config: Config, **kwargs: Any 4022 ) -> StreamConfig: 4023 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4024 [x for x in model.configs_pointer] if model.configs_pointer else [] 4025 ) 4026 4027 return StreamConfig( 4028 configs_pointer=model_configs_pointer, 4029 default_values=model.default_values, 4030 parameters=model.parameters or {}, 4031 ) 4032 4033 def create_config_components_resolver( 4034 self, 4035 model: ConfigComponentsResolverModel, 4036 config: Config, 4037 ) -> Any: 4038 model_stream_configs = ( 4039 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4040 ) 4041 4042 stream_configs = [ 4043 self._create_component_from_model( 4044 stream_config, config=config, parameters=model.parameters or {} 4045 ) 4046 for stream_config in model_stream_configs 4047 ] 4048 4049 components_mapping = [ 4050 self._create_component_from_model( 4051 model=components_mapping_definition_model, 4052 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4053 components_mapping_definition_model.value_type 4054 ), 4055 config=config, 4056 parameters=model.parameters, 4057 ) 4058 for components_mapping_definition_model in model.components_mapping 4059 ] 4060 4061 return ConfigComponentsResolver( 4062 stream_configs=stream_configs, 4063 config=config, 4064 components_mapping=components_mapping, 4065 parameters=model.parameters or {}, 4066 ) 4067 4068 def create_parametrized_components_resolver( 4069 self, 4070 model: ParametrizedComponentsResolverModel, 4071 config: Config, 4072 ) -> ParametrizedComponentsResolver: 4073 stream_parameters = StreamParametersDefinition( 4074 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4075 ) 4076 4077 components_mapping = [] 4078 for components_mapping_definition_model in model.components_mapping: 4079 if components_mapping_definition_model.condition: 4080 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4081 components_mapping.append( 4082 self._create_component_from_model( 4083 model=components_mapping_definition_model, 4084 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4085 components_mapping_definition_model.value_type 4086 ), 4087 config=config, 4088 ) 4089 ) 4090 return ParametrizedComponentsResolver( 4091 stream_parameters=stream_parameters, 4092 config=config, 4093 components_mapping=components_mapping, 4094 parameters=model.parameters or {}, 4095 ) 4096 4097 _UNSUPPORTED_DECODER_ERROR = ( 4098 "Specified decoder of {decoder_type} is not supported for pagination." 
4099 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4100 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4101 ) 4102 4103 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4104 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4105 return True 4106 elif isinstance(decoder, CompositeRawDecoder): 4107 return self._is_supported_parser_for_pagination(decoder.parser) 4108 else: 4109 return False 4110 4111 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4112 if isinstance(parser, JsonParser): 4113 return True 4114 elif isinstance(parser, GzipParser): 4115 return isinstance(parser.inner_parser, JsonParser) 4116 else: 4117 return False 4118 4119 def create_http_api_budget( 4120 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4121 ) -> HttpAPIBudget: 4122 policies = [ 4123 self._create_component_from_model(model=policy, config=config) 4124 for policy in model.policies 4125 ] 4126 4127 return HttpAPIBudget( 4128 policies=policies, 4129 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4130 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4131 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4132 ) 4133 4134 def create_fixed_window_call_rate_policy( 4135 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4136 ) -> FixedWindowCallRatePolicy: 4137 matchers = [ 4138 self._create_component_from_model(model=matcher, config=config) 4139 for matcher in model.matchers 4140 ] 4141 4142 # Set the initial reset timestamp to 10 days from now. 4143 # This value will be updated by the first request. 
4144 return FixedWindowCallRatePolicy( 4145 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4146 period=parse_duration(model.period), 4147 call_limit=model.call_limit, 4148 matchers=matchers, 4149 ) 4150 4151 def create_file_uploader( 4152 self, model: FileUploaderModel, config: Config, **kwargs: Any 4153 ) -> FileUploader: 4154 name = "File Uploader" 4155 requester = self._create_component_from_model( 4156 model=model.requester, 4157 config=config, 4158 name=name, 4159 **kwargs, 4160 ) 4161 download_target_extractor = self._create_component_from_model( 4162 model=model.download_target_extractor, 4163 config=config, 4164 name=name, 4165 **kwargs, 4166 ) 4167 emit_connector_builder_messages = self._emit_connector_builder_messages 4168 file_uploader = DefaultFileUploader( 4169 requester=requester, 4170 download_target_extractor=download_target_extractor, 4171 config=config, 4172 file_writer=NoopFileWriter() 4173 if emit_connector_builder_messages 4174 else LocalFileSystemFileWriter(), 4175 parameters=model.parameters or {}, 4176 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4177 ) 4178 4179 return ( 4180 ConnectorBuilderFileUploader(file_uploader) 4181 if emit_connector_builder_messages 4182 else file_uploader 4183 ) 4184 4185 def create_moving_window_call_rate_policy( 4186 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4187 ) -> MovingWindowCallRatePolicy: 4188 rates = [ 4189 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4190 ] 4191 matchers = [ 4192 self._create_component_from_model(model=matcher, config=config) 4193 for matcher in model.matchers 4194 ] 4195 return MovingWindowCallRatePolicy( 4196 rates=rates, 4197 matchers=matchers, 4198 ) 4199 4200 def create_unlimited_call_rate_policy( 4201 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4202 ) -> UnlimitedCallRatePolicy: 4203 matchers = [ 4204 self._create_component_from_model(model=matcher, config=config) 4205 for matcher in model.matchers 4206 ] 4207 4208 return UnlimitedCallRatePolicy( 4209 matchers=matchers, 4210 ) 4211 4212 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4213 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4214 return Rate( 4215 limit=int(interpolated_limit.eval(config=config)), 4216 interval=parse_duration(model.interval), 4217 ) 4218 4219 def create_http_request_matcher( 4220 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4221 ) -> HttpRequestRegexMatcher: 4222 return HttpRequestRegexMatcher( 4223 method=model.method, 4224 url_base=model.url_base, 4225 url_path_pattern=model.url_path_pattern, 4226 params=model.params, 4227 headers=model.headers, 4228 ) 4229 4230 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4231 self._api_budget = self.create_component( 4232 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4233 ) 4234 4235 def create_grouping_partition_router( 4236 self, 4237 model: GroupingPartitionRouterModel, 4238 config: Config, 4239 *, 4240 stream_name: str, 4241 **kwargs: Any, 4242 ) -> GroupingPartitionRouter: 4243 underlying_router = self._create_component_from_model( 4244 model=model.underlying_partition_router, 4245 config=config, 4246 stream_name=stream_name, 4247 **kwargs, 4248 ) 4249 if model.group_size < 1: 4250 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4251 4252 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4253 # because they are specific to individual partitions and cannot be aggregated or handled 4254 # when grouping, potentially leading to incorrect API calls. Any request customization 4255 # should be managed at the stream level through the requester's configuration. 4256 if isinstance(underlying_router, SubstreamPartitionRouter): 4257 if any( 4258 parent_config.request_option 4259 for parent_config in underlying_router.parent_stream_configs 4260 ): 4261 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4262 4263 if isinstance(underlying_router, ListPartitionRouter): 4264 if underlying_router.request_option: 4265 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4266 4267 return GroupingPartitionRouter( 4268 group_size=model.group_size, 4269 underlying_partition_router=underlying_router, 4270 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4271 config=config, 4272 ) 4273 4274 def _ensure_query_properties_to_model( 4275 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4276 ) -> None: 4277 """ 4278 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4279 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4280 proper model. 4281 """ 4282 if not hasattr(requester, "request_parameters"): 4283 return 4284 4285 request_parameters = requester.request_parameters 4286 if request_parameters and isinstance(request_parameters, Dict): 4287 for request_parameter_key in request_parameters.keys(): 4288 request_parameter = request_parameters[request_parameter_key] 4289 if ( 4290 isinstance(request_parameter, Dict) 4291 and request_parameter.get("type") == "QueryProperties" 4292 ): 4293 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4294 request_parameter 4295 )
671class ModelToComponentFactory: 672 EPOCH_DATETIME_FORMAT = "%s" 673 674 def __init__( 675 self, 676 limit_pages_fetched_per_slice: Optional[int] = None, 677 limit_slices_fetched: Optional[int] = None, 678 emit_connector_builder_messages: bool = False, 679 disable_retries: bool = False, 680 disable_cache: bool = False, 681 message_repository: Optional[MessageRepository] = None, 682 connector_state_manager: Optional[ConnectorStateManager] = None, 683 max_concurrent_async_job_count: Optional[int] = None, 684 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 685 api_budget: Optional[APIBudget] = None, 686 ): 687 self._init_mappings() 688 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 689 self._limit_slices_fetched = limit_slices_fetched 690 self._emit_connector_builder_messages = emit_connector_builder_messages 691 self._disable_retries = disable_retries 692 self._disable_cache = disable_cache 693 self._message_repository = message_repository or InMemoryMessageRepository( 694 self._evaluate_log_level(emit_connector_builder_messages) 695 ) 696 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 697 configured_catalog 698 ) 699 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 700 self._api_budget: Optional[Union[APIBudget]] = api_budget 701 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 702 # placeholder for deprecation warnings 703 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 704 705 def _init_mappings(self) -> None: 706 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 707 AddedFieldDefinitionModel: self.create_added_field_definition, 708 AddFieldsModel: self.create_add_fields, 709 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 710 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 711 BearerAuthenticatorModel: self.create_bearer_authenticator, 712 CheckStreamModel: self.create_check_stream, 713 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 714 CheckDynamicStreamModel: self.create_check_dynamic_stream, 715 CompositeErrorHandlerModel: self.create_composite_error_handler, 716 ConcurrencyLevelModel: self.create_concurrency_level, 717 ConfigMigrationModel: self.create_config_migration, 718 ConfigAddFieldsModel: self.create_config_add_fields, 719 ConfigRemapFieldModel: self.create_config_remap_field, 720 ConfigRemoveFieldsModel: self.create_config_remove_fields, 721 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 722 CsvDecoderModel: self.create_csv_decoder, 723 CursorPaginationModel: self.create_cursor_pagination, 724 CustomAuthenticatorModel: self.create_custom_component, 725 CustomBackoffStrategyModel: self.create_custom_component, 726 CustomDecoderModel: self.create_custom_component, 727 CustomErrorHandlerModel: self.create_custom_component, 728 CustomRecordExtractorModel: self.create_custom_component, 729 CustomRecordFilterModel: self.create_custom_component, 730 CustomRequesterModel: self.create_custom_component, 731 CustomRetrieverModel: self.create_custom_component, 732 CustomSchemaLoader: self.create_custom_component, 733 CustomSchemaNormalizationModel: self.create_custom_component, 734 CustomStateMigration: self.create_custom_component, 735 CustomPaginationStrategyModel: self.create_custom_component, 736 CustomPartitionRouterModel: self.create_custom_component, 737 CustomTransformationModel: 
self.create_custom_component, 738 CustomValidationStrategyModel: self.create_custom_component, 739 CustomConfigTransformationModel: self.create_custom_component, 740 DeclarativeStreamModel: self.create_default_stream, 741 DefaultErrorHandlerModel: self.create_default_error_handler, 742 DefaultPaginatorModel: self.create_default_paginator, 743 DpathExtractorModel: self.create_dpath_extractor, 744 DpathValidatorModel: self.create_dpath_validator, 745 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 746 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 747 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 748 GroupByKeyMergeStrategyModel: self.create_group_by_key, 749 HttpRequesterModel: self.create_http_requester, 750 HttpResponseFilterModel: self.create_http_response_filter, 751 InlineSchemaLoaderModel: self.create_inline_schema_loader, 752 JsonDecoderModel: self.create_json_decoder, 753 JsonlDecoderModel: self.create_jsonl_decoder, 754 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 755 GzipDecoderModel: self.create_gzip_decoder, 756 KeysToLowerModel: self.create_keys_to_lower_transformation, 757 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 758 KeysReplaceModel: self.create_keys_replace_transformation, 759 FlattenFieldsModel: self.create_flatten_fields, 760 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 761 IterableDecoderModel: self.create_iterable_decoder, 762 XmlDecoderModel: self.create_xml_decoder, 763 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 764 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 765 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 766 TypesMapModel: self.create_types_map, 767 ComplexFieldTypeModel: self.create_complex_field_type, 768 JwtAuthenticatorModel: self.create_jwt_authenticator, 769 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 770 ListPartitionRouterModel: self.create_list_partition_router, 771 MinMaxDatetimeModel: self.create_min_max_datetime, 772 NoAuthModel: self.create_no_auth, 773 NoPaginationModel: self.create_no_pagination, 774 OAuthAuthenticatorModel: self.create_oauth_authenticator, 775 OffsetIncrementModel: self.create_offset_increment, 776 PageIncrementModel: self.create_page_increment, 777 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 778 PredicateValidatorModel: self.create_predicate_validator, 779 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 780 PropertyChunkingModel: self.create_property_chunking, 781 QueryPropertiesModel: self.create_query_properties, 782 RecordFilterModel: self.create_record_filter, 783 RecordSelectorModel: self.create_record_selector, 784 RemoveFieldsModel: self.create_remove_fields, 785 RequestPathModel: self.create_request_path, 786 RequestOptionModel: self.create_request_option, 787 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 788 SelectiveAuthenticatorModel: self.create_selective_authenticator, 789 SimpleRetrieverModel: self.create_simple_retriever, 790 StateDelegatingStreamModel: self.create_state_delegating_stream, 791 SpecModel: self.create_spec, 792 SubstreamPartitionRouterModel: self.create_substream_partition_router, 793 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 794 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 795 WaitUntilTimeFromHeaderModel: 
self.create_wait_until_time_from_header, 796 AsyncRetrieverModel: self.create_async_retriever, 797 HttpComponentsResolverModel: self.create_http_components_resolver, 798 ConfigComponentsResolverModel: self.create_config_components_resolver, 799 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 800 StreamConfigModel: self.create_stream_config, 801 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 802 ZipfileDecoderModel: self.create_zipfile_decoder, 803 HTTPAPIBudgetModel: self.create_http_api_budget, 804 FileUploaderModel: self.create_file_uploader, 805 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 806 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 807 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 808 RateModel: self.create_rate, 809 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 810 GroupingPartitionRouterModel: self.create_grouping_partition_router, 811 } 812 813 # Needed for the case where we need to perform a second parse on the fields of a custom component 814 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 815 816 @staticmethod 817 def _create_stream_name_to_configured_stream( 818 configured_catalog: Optional[ConfiguredAirbyteCatalog], 819 ) -> Mapping[str, ConfiguredAirbyteStream]: 820 return ( 821 {stream.stream.name: stream for stream in configured_catalog.streams} 822 if configured_catalog 823 else {} 824 ) 825 826 def create_component( 827 self, 828 model_type: Type[BaseModel], 829 component_definition: ComponentDefinition, 830 config: Config, 831 **kwargs: Any, 832 ) -> Any: 833 """ 834 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 835 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 836 declarative components from that model. 837 838 :param model_type: The type of declarative component that is being initialized 839 :param component_definition: The mapping that represents a declarative component 840 :param config: The connector config that is provided by the customer 841 :return: The declarative component to be used at runtime 842 """ 843 844 component_type = component_definition.get("type") 845 if component_definition.get("type") != model_type.__name__: 846 raise ValueError( 847 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 848 ) 849 850 declarative_component_model = model_type.parse_obj(component_definition) 851 852 if not isinstance(declarative_component_model, model_type): 853 raise ValueError( 854 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 855 ) 856 857 return self._create_component_from_model( 858 model=declarative_component_model, config=config, **kwargs 859 ) 860 861 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 862 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 863 raise ValueError( 864 f"{model.__class__} with attributes {model} is not a valid component type" 865 ) 866 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 867 if not component_constructor: 868 raise ValueError(f"Could not find constructor for {model.__class__}") 869 870 # collect deprecation warnings for supported models.
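        # Models that declare deprecated fields are parsed as BaseModelWithDeprecations and carry
        # their own `_deprecation_logs`; the check below accumulates those entries on the factory
        # (see get_model_deprecations / _collect_model_deprecations) so that callers such as the
        # Connector Builder (note the ConnectorBuilderLogMessage type) can surface each warning once.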
871 if isinstance(model, BaseModelWithDeprecations): 872 self._collect_model_deprecations(model) 873 874 return component_constructor(model=model, config=config, **kwargs) 875 876 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 877 """ 878 Returns the deprecation warnings that were collected during the creation of components. 879 """ 880 return self._collected_deprecation_logs 881 882 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 883 """ 884 Collects deprecation logs from the given model and appends any new logs to the internal collection. 885 886 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 887 888 Args: 889 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 890 """ 891 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 892 for log in model._deprecation_logs: 893 # avoid duplicates for deprecation logs observed. 894 if log not in self._collected_deprecation_logs: 895 self._collected_deprecation_logs.append(log) 896 897 def create_config_migration( 898 self, model: ConfigMigrationModel, config: Config 899 ) -> ConfigMigration: 900 transformations: List[ConfigTransformation] = [ 901 self._create_component_from_model(transformation, config) 902 for transformation in model.transformations 903 ] 904 905 return ConfigMigration( 906 description=model.description, 907 transformations=transformations, 908 ) 909 910 def create_config_add_fields( 911 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 912 ) -> ConfigAddFields: 913 fields = [self._create_component_from_model(field, config) for field in model.fields] 914 return ConfigAddFields( 915 fields=fields, 916 condition=model.condition or "", 917 ) 918 919 @staticmethod 920 def create_config_remove_fields( 921 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 922 ) -> ConfigRemoveFields: 923 return ConfigRemoveFields( 924 field_pointers=model.field_pointers, 925 condition=model.condition or "", 926 ) 927 928 @staticmethod 929 def create_config_remap_field( 930 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 931 ) -> ConfigRemapField: 932 mapping = cast(Mapping[str, Any], model.map) 933 return ConfigRemapField( 934 map=mapping, 935 field_path=model.field_path, 936 config=config, 937 ) 938 939 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 940 strategy = self._create_component_from_model(model.validation_strategy, config) 941 942 return DpathValidator( 943 field_path=model.field_path, 944 strategy=strategy, 945 ) 946 947 def create_predicate_validator( 948 self, model: PredicateValidatorModel, config: Config 949 ) -> PredicateValidator: 950 strategy = self._create_component_from_model(model.validation_strategy, config) 951 952 return PredicateValidator( 953 value=model.value, 954 strategy=strategy, 955 ) 956 957 @staticmethod 958 def create_validate_adheres_to_schema( 959 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 960 ) -> ValidateAdheresToSchema: 961 base_schema = cast(Mapping[str, Any], model.base_schema) 962 return ValidateAdheresToSchema( 963 schema=base_schema, 964 ) 965 966 @staticmethod 967 
def create_added_field_definition( 968 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 969 ) -> AddedFieldDefinition: 970 interpolated_value = InterpolatedString.create( 971 model.value, parameters=model.parameters or {} 972 ) 973 return AddedFieldDefinition( 974 path=model.path, 975 value=interpolated_value, 976 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 977 parameters=model.parameters or {}, 978 ) 979 980 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 981 added_field_definitions = [ 982 self._create_component_from_model( 983 model=added_field_definition_model, 984 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 985 added_field_definition_model.value_type 986 ), 987 config=config, 988 ) 989 for added_field_definition_model in model.fields 990 ] 991 return AddFields( 992 fields=added_field_definitions, 993 condition=model.condition or "", 994 parameters=model.parameters or {}, 995 ) 996 997 def create_keys_to_lower_transformation( 998 self, model: KeysToLowerModel, config: Config, **kwargs: Any 999 ) -> KeysToLowerTransformation: 1000 return KeysToLowerTransformation() 1001 1002 def create_keys_to_snake_transformation( 1003 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 1004 ) -> KeysToSnakeCaseTransformation: 1005 return KeysToSnakeCaseTransformation() 1006 1007 def create_keys_replace_transformation( 1008 self, model: KeysReplaceModel, config: Config, **kwargs: Any 1009 ) -> KeysReplaceTransformation: 1010 return KeysReplaceTransformation( 1011 old=model.old, new=model.new, parameters=model.parameters or {} 1012 ) 1013 1014 def create_flatten_fields( 1015 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 1016 ) -> FlattenFields: 1017 return FlattenFields( 1018 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 1019 ) 1020 1021 def create_dpath_flatten_fields( 1022 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1023 ) -> DpathFlattenFields: 1024 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1025 key_transformation = ( 1026 KeyTransformation( 1027 config=config, 1028 prefix=model.key_transformation.prefix, 1029 suffix=model.key_transformation.suffix, 1030 parameters=model.parameters or {}, 1031 ) 1032 if model.key_transformation is not None 1033 else None 1034 ) 1035 return DpathFlattenFields( 1036 config=config, 1037 field_path=model_field_path, 1038 delete_origin_value=model.delete_origin_value 1039 if model.delete_origin_value is not None 1040 else False, 1041 replace_record=model.replace_record if model.replace_record is not None else False, 1042 key_transformation=key_transformation, 1043 parameters=model.parameters or {}, 1044 ) 1045 1046 @staticmethod 1047 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 1048 if not value_type: 1049 return None 1050 names_to_types = { 1051 ValueType.string: str, 1052 ValueType.number: float, 1053 ValueType.integer: int, 1054 ValueType.boolean: bool, 1055 } 1056 return names_to_types[value_type] 1057 1058 def create_api_key_authenticator( 1059 self, 1060 model: ApiKeyAuthenticatorModel, 1061 config: Config, 1062 token_provider: Optional[TokenProvider] = None, 1063 **kwargs: Any, 1064 ) -> ApiKeyAuthenticator: 1065 if model.inject_into is None and model.header is None: 1066 raise ValueError( 1067 "Expected either inject_into or header to be set for 
ApiKeyAuthenticator" 1068 ) 1069 1070 if model.inject_into is not None and model.header is not None: 1071 raise ValueError( 1072 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1073 ) 1074 1075 if token_provider is not None and model.api_token != "": 1076 raise ValueError( 1077 "If token_provider is set, api_token is ignored and has to be set to empty string." 1078 ) 1079 1080 request_option = ( 1081 self._create_component_from_model( 1082 model.inject_into, config, parameters=model.parameters or {} 1083 ) 1084 if model.inject_into 1085 else RequestOption( 1086 inject_into=RequestOptionType.header, 1087 field_name=model.header or "", 1088 parameters=model.parameters or {}, 1089 ) 1090 ) 1091 1092 return ApiKeyAuthenticator( 1093 token_provider=( 1094 token_provider 1095 if token_provider is not None 1096 else InterpolatedStringTokenProvider( 1097 api_token=model.api_token or "", 1098 config=config, 1099 parameters=model.parameters or {}, 1100 ) 1101 ), 1102 request_option=request_option, 1103 config=config, 1104 parameters=model.parameters or {}, 1105 ) 1106 1107 def create_legacy_to_per_partition_state_migration( 1108 self, 1109 model: LegacyToPerPartitionStateMigrationModel, 1110 config: Mapping[str, Any], 1111 declarative_stream: DeclarativeStreamModel, 1112 ) -> LegacyToPerPartitionStateMigration: 1113 retriever = declarative_stream.retriever 1114 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1115 raise ValueError( 1116 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1117 ) 1118 partition_router = retriever.partition_router 1119 if not isinstance( 1120 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1121 ): 1122 raise ValueError( 1123 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1124 ) 1125 if not hasattr(partition_router, "parent_stream_configs"): 1126 raise ValueError( 1127 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1128 ) 1129 1130 if not hasattr(declarative_stream, "incremental_sync"): 1131 raise ValueError( 1132 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1133 ) 1134 1135 return LegacyToPerPartitionStateMigration( 1136 partition_router, # type: ignore # was already checked above 1137 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
1138 config, 1139 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1140 ) 1141 1142 def create_session_token_authenticator( 1143 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1144 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1145 decoder = ( 1146 self._create_component_from_model(model=model.decoder, config=config) 1147 if model.decoder 1148 else JsonDecoder(parameters={}) 1149 ) 1150 login_requester = self._create_component_from_model( 1151 model=model.login_requester, 1152 config=config, 1153 name=f"{name}_login_requester", 1154 decoder=decoder, 1155 ) 1156 token_provider = SessionTokenProvider( 1157 login_requester=login_requester, 1158 session_token_path=model.session_token_path, 1159 expiration_duration=parse_duration(model.expiration_duration) 1160 if model.expiration_duration 1161 else None, 1162 parameters=model.parameters or {}, 1163 message_repository=self._message_repository, 1164 decoder=decoder, 1165 ) 1166 if model.request_authentication.type == "Bearer": 1167 return ModelToComponentFactory.create_bearer_authenticator( 1168 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1169 config, 1170 token_provider=token_provider, 1171 ) 1172 else: 1173 return self.create_api_key_authenticator( 1174 ApiKeyAuthenticatorModel( 1175 type="ApiKeyAuthenticator", 1176 api_token="", 1177 inject_into=model.request_authentication.inject_into, 1178 ), # type: ignore # $parameters and headers default to None 1179 config=config, 1180 token_provider=token_provider, 1181 ) 1182 1183 @staticmethod 1184 def create_basic_http_authenticator( 1185 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1186 ) -> BasicHttpAuthenticator: 1187 return BasicHttpAuthenticator( 1188 password=model.password or "", 1189 username=model.username, 1190 config=config, 1191 parameters=model.parameters or {}, 1192 ) 1193 1194 @staticmethod 1195 def create_bearer_authenticator( 1196 model: BearerAuthenticatorModel, 1197 config: Config, 1198 token_provider: Optional[TokenProvider] = None, 1199 **kwargs: Any, 1200 ) -> BearerAuthenticator: 1201 if token_provider is not None and model.api_token != "": 1202 raise ValueError( 1203 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1204 ) 1205 return BearerAuthenticator( 1206 token_provider=( 1207 token_provider 1208 if token_provider is not None 1209 else InterpolatedStringTokenProvider( 1210 api_token=model.api_token or "", 1211 config=config, 1212 parameters=model.parameters or {}, 1213 ) 1214 ), 1215 config=config, 1216 parameters=model.parameters or {}, 1217 ) 1218 1219 @staticmethod 1220 def create_dynamic_stream_check_config( 1221 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1222 ) -> DynamicStreamCheckConfig: 1223 return DynamicStreamCheckConfig( 1224 dynamic_stream_name=model.dynamic_stream_name, 1225 stream_count=model.stream_count or 0, 1226 ) 1227 1228 def create_check_stream( 1229 self, model: CheckStreamModel, config: Config, **kwargs: Any 1230 ) -> CheckStream: 1231 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1232 raise ValueError( 1233 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1234 ) 1235 1236 dynamic_streams_check_configs = ( 1237 [ 1238 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1239 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1240 ] 1241 if model.dynamic_streams_check_configs 1242 else [] 1243 ) 1244 1245 return CheckStream( 1246 stream_names=model.stream_names or [], 1247 dynamic_streams_check_configs=dynamic_streams_check_configs, 1248 parameters={}, 1249 ) 1250 1251 @staticmethod 1252 def create_check_dynamic_stream( 1253 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1254 ) -> CheckDynamicStream: 1255 assert model.use_check_availability is not None # for mypy 1256 1257 use_check_availability = model.use_check_availability 1258 1259 return CheckDynamicStream( 1260 stream_count=model.stream_count, 1261 use_check_availability=use_check_availability, 1262 parameters={}, 1263 ) 1264 1265 def create_composite_error_handler( 1266 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1267 ) -> CompositeErrorHandler: 1268 error_handlers = [ 1269 self._create_component_from_model(model=error_handler_model, config=config) 1270 for error_handler_model in model.error_handlers 1271 ] 1272 return CompositeErrorHandler( 1273 error_handlers=error_handlers, parameters=model.parameters or {} 1274 ) 1275 1276 @staticmethod 1277 def create_concurrency_level( 1278 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1279 ) -> ConcurrencyLevel: 1280 return ConcurrencyLevel( 1281 default_concurrency=model.default_concurrency, 1282 max_concurrency=model.max_concurrency, 1283 config=config, 1284 parameters={}, 1285 ) 1286 1287 @staticmethod 1288 def apply_stream_state_migrations( 1289 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1290 ) -> MutableMapping[str, Any]: 1291 if stream_state_migrations: 1292 for state_migration in stream_state_migrations: 1293 if state_migration.should_migrate(stream_state): 1294 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1295 stream_state = dict(state_migration.migrate(stream_state)) 1296 return stream_state 1297 1298 def create_concurrent_cursor_from_datetime_based_cursor( 1299 self, 1300 model_type: Type[BaseModel], 1301 component_definition: ComponentDefinition, 1302 stream_name: str, 1303 stream_namespace: Optional[str], 1304 stream_state: MutableMapping[str, Any], 1305 config: Config, 1306 message_repository: Optional[MessageRepository] = None, 1307 runtime_lookback_window: Optional[datetime.timedelta] = None, 1308 **kwargs: Any, 1309 ) -> ConcurrentCursor: 1310 component_type = component_definition.get("type") 1311 if component_definition.get("type") != model_type.__name__: 1312 raise ValueError( 1313 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1314 ) 1315 1316 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1317 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1318 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1319 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1320 if "$parameters" not in component_definition and "parameters" in component_definition: 1321 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1322 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1323 1324 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1325 raise ValueError( 1326 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1327 ) 1328 1329 model_parameters = datetime_based_cursor_model.parameters or {} 1330 interpolated_cursor_field = InterpolatedString.create( 1331 datetime_based_cursor_model.cursor_field, 1332 parameters=model_parameters, 1333 ) 1334 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1335 1336 interpolated_partition_field_start = InterpolatedString.create( 1337 datetime_based_cursor_model.partition_field_start or "start_time", 1338 parameters=model_parameters, 1339 ) 1340 interpolated_partition_field_end = InterpolatedString.create( 1341 datetime_based_cursor_model.partition_field_end or "end_time", 1342 parameters=model_parameters, 1343 ) 1344 1345 slice_boundary_fields = ( 1346 interpolated_partition_field_start.eval(config=config), 1347 interpolated_partition_field_end.eval(config=config), 1348 ) 1349 1350 datetime_format = datetime_based_cursor_model.datetime_format 1351 1352 cursor_granularity = ( 1353 parse_duration(datetime_based_cursor_model.cursor_granularity) 1354 if datetime_based_cursor_model.cursor_granularity 1355 else None 1356 ) 1357 1358 lookback_window = None 1359 interpolated_lookback_window = ( 1360 InterpolatedString.create( 1361 datetime_based_cursor_model.lookback_window, 1362 parameters=model_parameters, 1363 ) 1364 if datetime_based_cursor_model.lookback_window 1365 else None 1366 ) 1367 if interpolated_lookback_window: 1368 
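            # The lookback window is an ISO 8601 duration string (e.g. "P1D"), possibly produced by
            # interpolation (a hypothetical "{{ config['lookback'] }}"); parse_duration below turns the
            # evaluated value into a timedelta that is handed to the ConcurrentCursor as lookback_window.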
evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1369 if evaluated_lookback_window: 1370 lookback_window = parse_duration(evaluated_lookback_window) 1371 1372 connector_state_converter: DateTimeStreamStateConverter 1373 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1374 datetime_format=datetime_format, 1375 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1376 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1377 cursor_granularity=cursor_granularity, 1378 ) 1379 1380 # Adjusts the stream state by applying the runtime lookback window. 1381 # This is used to ensure correct state handling in case of failed partitions. 1382 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1383 if runtime_lookback_window and stream_state_value: 1384 new_stream_state = ( 1385 connector_state_converter.parse_timestamp(stream_state_value) 1386 - runtime_lookback_window 1387 ) 1388 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1389 new_stream_state 1390 ) 1391 1392 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1393 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1394 start_date_runtime_value = self.create_min_max_datetime( 1395 model=datetime_based_cursor_model.start_datetime, config=config 1396 ) 1397 else: 1398 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1399 1400 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1401 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1402 end_date_runtime_value = self.create_min_max_datetime( 1403 model=datetime_based_cursor_model.end_datetime, config=config 1404 ) 1405 else: 1406 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1407 1408 interpolated_start_date = MinMaxDatetime.create( 1409 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1410 parameters=datetime_based_cursor_model.parameters, 1411 ) 1412 interpolated_end_date = ( 1413 None 1414 if not end_date_runtime_value 1415 else MinMaxDatetime.create( 1416 end_date_runtime_value, datetime_based_cursor_model.parameters 1417 ) 1418 ) 1419 1420 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1421 if not interpolated_start_date.datetime_format: 1422 interpolated_start_date.datetime_format = datetime_format 1423 if interpolated_end_date and not interpolated_end_date.datetime_format: 1424 interpolated_end_date.datetime_format = datetime_format 1425 1426 start_date = interpolated_start_date.get_datetime(config=config) 1427 end_date_provider = ( 1428 partial(interpolated_end_date.get_datetime, config) 1429 if interpolated_end_date 1430 else connector_state_converter.get_end_provider() 1431 ) 1432 1433 if ( 1434 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1435 ) or ( 1436 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1437 ): 1438 raise ValueError( 1439 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1440 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1441 ) 1442 1443 # When step is not defined, default to a step size from the starting date to the present moment 1444 step_length = datetime.timedelta.max 1445 interpolated_step = ( 1446 InterpolatedString.create( 1447 datetime_based_cursor_model.step, 1448 parameters=model_parameters, 1449 ) 1450 if datetime_based_cursor_model.step 1451 else None 1452 ) 1453 if interpolated_step: 1454 evaluated_step = interpolated_step.eval(config) 1455 if evaluated_step: 1456 step_length = parse_duration(evaluated_step) 1457 1458 clamping_strategy: ClampingStrategy = NoClamping() 1459 if datetime_based_cursor_model.clamping: 1460 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1461 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1462 # object which we want to keep agnostic of being low-code 1463 target = InterpolatedString( 1464 string=datetime_based_cursor_model.clamping.target, 1465 parameters=model_parameters, 1466 ) 1467 evaluated_target = target.eval(config=config) 1468 match evaluated_target: 1469 case "DAY": 1470 clamping_strategy = DayClampingStrategy() 1471 end_date_provider = ClampingEndProvider( 1472 DayClampingStrategy(is_ceiling=False), 1473 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1474 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1475 ) 1476 case "WEEK": 1477 if ( 1478 not datetime_based_cursor_model.clamping.target_details 1479 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1480 ): 1481 raise ValueError( 1482 "Given WEEK clamping, weekday needs to be provided as target_details" 1483 ) 1484 weekday = self._assemble_weekday( 1485 datetime_based_cursor_model.clamping.target_details["weekday"] 1486 ) 1487 clamping_strategy = WeekClampingStrategy(weekday) 1488 end_date_provider = ClampingEndProvider( 1489 WeekClampingStrategy(weekday, is_ceiling=False), 1490 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1491 granularity=cursor_granularity or datetime.timedelta(days=1), 1492 ) 1493 case "MONTH": 1494 clamping_strategy = MonthClampingStrategy() 1495 end_date_provider = ClampingEndProvider( 1496 MonthClampingStrategy(is_ceiling=False), 1497 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1498 granularity=cursor_granularity or datetime.timedelta(days=1), 1499 ) 1500 case _: 1501 raise ValueError( 1502 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1503 ) 1504 1505 return ConcurrentCursor( 1506 stream_name=stream_name, 1507 stream_namespace=stream_namespace, 1508 stream_state=stream_state, 1509 message_repository=message_repository or self._message_repository, 1510 connector_state_manager=self._connector_state_manager, 1511 connector_state_converter=connector_state_converter, 1512 cursor_field=cursor_field, 1513 slice_boundary_fields=slice_boundary_fields, 1514 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1515 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 lookback_window=lookback_window, 1517 slice_range=step_length, 1518 cursor_granularity=cursor_granularity, 1519 clamping_strategy=clamping_strategy, 1520 ) 1521 1522 def create_concurrent_cursor_from_incrementing_count_cursor( 1523 self, 1524 model_type: Type[BaseModel], 1525 component_definition: ComponentDefinition, 1526 stream_name: str, 1527 stream_namespace: Optional[str], 1528 stream_state: MutableMapping[str, Any], 1529 config: Config, 1530 message_repository: Optional[MessageRepository] = None, 1531 **kwargs: Any, 1532 ) -> ConcurrentCursor: 1533 component_type = component_definition.get("type") 1534 if component_definition.get("type") != model_type.__name__: 1535 raise ValueError( 1536 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1537 ) 1538 1539 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1540 1541 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1542 raise ValueError( 1543 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1544 ) 1545 1546 interpolated_start_value = ( 1547 InterpolatedString.create( 1548 incrementing_count_cursor_model.start_value, # type: ignore 1549 parameters=incrementing_count_cursor_model.parameters or {}, 1550 ) 1551 if incrementing_count_cursor_model.start_value 1552 else 0 1553 ) 1554 1555 interpolated_cursor_field = InterpolatedString.create( 1556 incrementing_count_cursor_model.cursor_field, 1557 parameters=incrementing_count_cursor_model.parameters or {}, 1558 ) 1559 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1560 1561 connector_state_converter = IncrementingCountStreamStateConverter( 1562 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1563 ) 1564 1565 return ConcurrentCursor( 1566 stream_name=stream_name, 1567 stream_namespace=stream_namespace, 1568 stream_state=stream_state, 1569 message_repository=message_repository or self._message_repository, 1570 connector_state_manager=self._connector_state_manager, 1571 connector_state_converter=connector_state_converter, 1572 cursor_field=cursor_field, 1573 slice_boundary_fields=None, 1574 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1575 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1576 ) 1577 1578 def _assemble_weekday(self, weekday: str) -> Weekday: 1579 match weekday: 1580 case "MONDAY": 1581 return Weekday.MONDAY 1582 case "TUESDAY": 1583 return Weekday.TUESDAY 1584 case "WEDNESDAY": 1585 return Weekday.WEDNESDAY 1586 case "THURSDAY": 1587 return Weekday.THURSDAY 1588 case "FRIDAY": 1589 return Weekday.FRIDAY 1590 case "SATURDAY": 1591 return Weekday.SATURDAY 1592 case "SUNDAY": 1593 return Weekday.SUNDAY 1594 case _: 1595 raise ValueError(f"Unknown weekday {weekday}") 1596 1597 def create_concurrent_cursor_from_perpartition_cursor( 1598 self, 1599 state_manager: ConnectorStateManager, 1600 model_type: Type[BaseModel], 1601 component_definition: ComponentDefinition, 1602 stream_name: str, 1603 stream_namespace: Optional[str], 1604 config: Config, 1605 stream_state: MutableMapping[str, Any], 1606 partition_router: PartitionRouter, 1607 attempt_to_create_cursor_if_not_provided: bool = False, 1608 **kwargs: Any, 1609 ) -> ConcurrentPerPartitionCursor: 1610 component_type = component_definition.get("type") 1611 if component_definition.get("type") != model_type.__name__: 1612 raise ValueError( 1613 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1614 ) 1615 1616 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1617 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1618 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1619 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1620 if "$parameters" not in component_definition and "parameters" in component_definition: 1621 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1622 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1623 1624 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1625 raise ValueError( 1626 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1627 ) 1628 1629 interpolated_cursor_field = InterpolatedString.create( 1630 datetime_based_cursor_model.cursor_field, 1631 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1632 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1633 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1634 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1635 parameters=datetime_based_cursor_model.parameters or {}, 1636 ) 1637 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1638 1639 datetime_format = datetime_based_cursor_model.datetime_format 1640 1641 cursor_granularity = ( 1642 parse_duration(datetime_based_cursor_model.cursor_granularity) 1643 if datetime_based_cursor_model.cursor_granularity 1644 else None 1645 ) 1646 1647 connector_state_converter: DateTimeStreamStateConverter 1648 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1649 datetime_format=datetime_format, 1650 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1651 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1652 cursor_granularity=cursor_granularity, 1653 ) 1654 1655 # Create the cursor factory 1656 cursor_factory = ConcurrentCursorFactory( 1657 partial( 1658 self.create_concurrent_cursor_from_datetime_based_cursor, 1659 state_manager=state_manager, 1660 model_type=model_type, 1661 component_definition=component_definition, 1662 stream_name=stream_name, 1663 stream_namespace=stream_namespace, 1664 config=config, 1665 message_repository=NoopMessageRepository(), 1666 ) 1667 ) 1668 1669 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1670 use_global_cursor = isinstance( 1671 partition_router, GroupingPartitionRouter 1672 ) or component_definition.get("global_substream_cursor", False) 1673 1674 # Return the concurrent cursor and state converter 1675 return ConcurrentPerPartitionCursor( 1676 cursor_factory=cursor_factory, 1677 partition_router=partition_router, 1678 stream_name=stream_name, 1679 stream_namespace=stream_namespace, 1680 stream_state=stream_state, 1681 message_repository=self._message_repository, # type: ignore 1682 connector_state_manager=state_manager, 1683 connector_state_converter=connector_state_converter, 1684 cursor_field=cursor_field, 1685 use_global_cursor=use_global_cursor, 1686 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1687 ) 1688 1689 @staticmethod 1690 def create_constant_backoff_strategy( 1691 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1692 ) -> ConstantBackoffStrategy: 1693 return ConstantBackoffStrategy( 1694 backoff_time_in_seconds=model.backoff_time_in_seconds, 1695 config=config, 1696 parameters=model.parameters or {}, 1697 ) 1698 1699 def create_cursor_pagination( 1700 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1701 ) -> CursorPaginationStrategy: 1702 if isinstance(decoder, PaginationDecoderDecorator): 1703 inner_decoder = decoder.decoder 1704 else: 1705 inner_decoder = decoder 1706 decoder = PaginationDecoderDecorator(decoder=decoder) 1707 1708 if self._is_supported_decoder_for_pagination(inner_decoder): 1709 decoder_to_use = decoder 1710 else: 1711 raise ValueError( 1712 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1713 ) 1714 1715 return CursorPaginationStrategy( 1716 cursor_value=model.cursor_value, 1717 decoder=decoder_to_use, 1718 
page_size=model.page_size, 1719 stop_condition=model.stop_condition, 1720 config=config, 1721 parameters=model.parameters or {}, 1722 ) 1723 1724 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1725 """ 1726 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1727 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1728 :param model: The Pydantic model of the custom component being created 1729 :param config: The custom defined connector config 1730 :return: The declarative component built from the Pydantic model to be used at runtime 1731 """ 1732 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1733 component_fields = get_type_hints(custom_component_class) 1734 model_args = model.dict() 1735 model_args["config"] = config 1736 1737 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1738 # we defer to these arguments over the component's definition 1739 for key, arg in kwargs.items(): 1740 model_args[key] = arg 1741 1742 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1743 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1744 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1745 for model_field, model_value in model_args.items(): 1746 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1747 if ( 1748 isinstance(model_value, dict) 1749 and "type" not in model_value 1750 and model_field in component_fields 1751 ): 1752 derived_type = self._derive_component_type_from_type_hints( 1753 component_fields.get(model_field) 1754 ) 1755 if derived_type: 1756 model_value["type"] = derived_type 1757 1758 if self._is_component(model_value): 1759 model_args[model_field] = self._create_nested_component( 1760 model, 1761 model_field, 1762 model_value, 1763 config, 1764 **kwargs, 1765 ) 1766 elif isinstance(model_value, list): 1767 vals = [] 1768 for v in model_value: 1769 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1770 derived_type = self._derive_component_type_from_type_hints( 1771 component_fields.get(model_field) 1772 ) 1773 if derived_type: 1774 v["type"] = derived_type 1775 if self._is_component(v): 1776 vals.append( 1777 self._create_nested_component( 1778 model, 1779 model_field, 1780 v, 1781 config, 1782 **kwargs, 1783 ) 1784 ) 1785 else: 1786 vals.append(v) 1787 model_args[model_field] = vals 1788 1789 kwargs = { 1790 class_field: model_args[class_field] 1791 for class_field in component_fields.keys() 1792 if class_field in model_args 1793 } 1794 return custom_component_class(**kwargs) 1795 1796 @staticmethod 1797 def _get_class_from_fully_qualified_class_name( 1798 full_qualified_class_name: str, 1799 ) -> Any: 1800 """Get a class from its fully qualified name. 1801 1802 If a custom components module is needed, we assume it is already registered - probably 1803 as `source_declarative_manifest.components` or `components`. 1804 1805 Args: 1806 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 
1807 1808 Returns: 1809 Any: The class object. 1810 1811 Raises: 1812 ValueError: If the class cannot be loaded. 1813 """ 1814 split = full_qualified_class_name.split(".") 1815 module_name_full = ".".join(split[:-1]) 1816 class_name = split[-1] 1817 1818 try: 1819 module_ref = importlib.import_module(module_name_full) 1820 except ModuleNotFoundError as e: 1821 if split[0] == "source_declarative_manifest": 1822 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it at runtime using sys.path.append 1823 try: 1824 import os 1825 1826 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1827 module_ref = importlib.import_module( 1828 module_name_with_source_declarative_manifest 1829 ) 1830 except ModuleNotFoundError: 1831 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1832 else: 1833 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1834 1835 try: 1836 return getattr(module_ref, class_name) 1837 except AttributeError as e: 1838 raise ValueError( 1839 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1840 ) from e 1841 1842 @staticmethod 1843 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1844 interface = field_type 1845 while True: 1846 origin = get_origin(interface) 1847 if origin: 1848 # Unnest types until we reach the raw type 1849 # List[T] -> T 1850 # Optional[List[T]] -> T 1851 args = get_args(interface) 1852 interface = args[0] 1853 else: 1854 break 1855 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1856 return interface.__name__ 1857 return None 1858 1859 @staticmethod 1860 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1861 if not cls: 1862 return False 1863 return cls.__module__ == "builtins" 1864 1865 @staticmethod 1866 def _extract_missing_parameters(error: TypeError) -> List[str]: 1867 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1868 if parameter_search: 1869 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1870 else: 1871 return [] 1872 1873 def _create_nested_component( 1874 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1875 ) -> Any: 1876 type_name = model_value.get("type", None) 1877 if not type_name: 1878 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1879 return model_value 1880 1881 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1882 if model_type: 1883 parsed_model = model_type.parse_obj(model_value) 1884 try: 1885 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1886 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1887 # components and passing them to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1888 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1889 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1890 # are needed by a component and could not be shared.
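                # Hypothetical illustration of the block below: if the mapped create_* method declared a
                # keyword-only `name` argument, a nested definition such as
                #     {"type": "SimpleRetriever", "$parameters": {"name": "users"}, ...}
                # would have "name" picked out of "$parameters" (via inspect.getfullargspec) and forwarded
                # to _create_component_from_model, together with any matching entries from **kwargs.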
1891 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1892 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1893 model_parameters = model_value.get("$parameters", {}) 1894 matching_parameters = { 1895 kwarg: model_parameters[kwarg] 1896 for kwarg in constructor_kwargs 1897 if kwarg in model_parameters 1898 } 1899 matching_kwargs = { 1900 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1901 } 1902 return self._create_component_from_model( 1903 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1904 ) 1905 except TypeError as error: 1906 missing_parameters = self._extract_missing_parameters(error) 1907 if missing_parameters: 1908 raise ValueError( 1909 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1910 + ", ".join( 1911 ( 1912 f"{type_name}.$parameters.{parameter}" 1913 for parameter in missing_parameters 1914 ) 1915 ) 1916 ) 1917 raise TypeError( 1918 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1919 ) 1920 else: 1921 raise ValueError( 1922 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1923 ) 1924 1925 @staticmethod 1926 def _is_component(model_value: Any) -> bool: 1927 return isinstance(model_value, dict) and model_value.get("type") is not None 1928 1929 def create_default_stream( 1930 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1931 ) -> AbstractStream: 1932 primary_key = model.primary_key.__root__ if model.primary_key else None 1933 self._migrate_state(model, config) 1934 1935 partition_router = self._build_stream_slicer_from_partition_router( 1936 model.retriever, 1937 config, 1938 stream_name=model.name, 1939 **kwargs, 1940 ) 1941 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1942 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1943 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1944 1945 end_time_option = ( 1946 self._create_component_from_model( 1947 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1948 ) 1949 if cursor_model.end_time_option 1950 else None 1951 ) 1952 start_time_option = ( 1953 self._create_component_from_model( 1954 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1955 ) 1956 if cursor_model.start_time_option 1957 else None 1958 ) 1959 1960 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 1961 start_time_option=start_time_option, 1962 end_time_option=end_time_option, 1963 partition_field_start=cursor_model.partition_field_start, 1964 partition_field_end=cursor_model.partition_field_end, 1965 config=config, 1966 parameters=model.parameters or {}, 1967 ) 1968 request_options_provider = ( 1969 datetime_request_options_provider 1970 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 1971 else PerPartitionRequestOptionsProvider( 1972 partition_router, datetime_request_options_provider 1973 ) 1974 ) 1975 elif model.incremental_sync and isinstance( 1976 model.incremental_sync, IncrementingCountCursorModel 1977 ): 1978 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 1979 raise ValueError( 1980 "PerPartition does not support per partition states because switching to global state is time based" 1981 ) 1982 1983 cursor_model: 
IncrementingCountCursorModel = model.incremental_sync # type: ignore 1984 1985 start_time_option = ( 1986 self._create_component_from_model( 1987 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1988 config, 1989 parameters=cursor_model.parameters or {}, 1990 ) 1991 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1992 else None 1993 ) 1994 1995 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1996 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1997 partition_field_start = "start" 1998 1999 request_options_provider = DatetimeBasedRequestOptionsProvider( 2000 start_time_option=start_time_option, 2001 partition_field_start=partition_field_start, 2002 config=config, 2003 parameters=model.parameters or {}, 2004 ) 2005 else: 2006 request_options_provider = None 2007 2008 transformations = [] 2009 if model.transformations: 2010 for transformation_model in model.transformations: 2011 transformations.append( 2012 self._create_component_from_model(model=transformation_model, config=config) 2013 ) 2014 file_uploader = None 2015 if model.file_uploader: 2016 file_uploader = self._create_component_from_model( 2017 model=model.file_uploader, config=config 2018 ) 2019 2020 stream_slicer: ConcurrentStreamSlicer = ( 2021 partition_router 2022 if isinstance(concurrent_cursor, FinalStateCursor) 2023 else concurrent_cursor 2024 ) 2025 2026 retriever = self._create_component_from_model( 2027 model=model.retriever, 2028 config=config, 2029 name=model.name, 2030 primary_key=primary_key, 2031 request_options_provider=request_options_provider, 2032 stream_slicer=stream_slicer, 2033 partition_router=partition_router, 2034 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2035 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2036 cursor=concurrent_cursor, 2037 transformations=transformations, 2038 file_uploader=file_uploader, 2039 incremental_sync=model.incremental_sync, 2040 ) 2041 if isinstance(retriever, AsyncRetriever): 2042 stream_slicer = retriever.stream_slicer 2043 2044 schema_loader: SchemaLoader 2045 if model.schema_loader and isinstance(model.schema_loader, list): 2046 nested_schema_loaders = [ 2047 self._create_component_from_model(model=nested_schema_loader, config=config) 2048 for nested_schema_loader in model.schema_loader 2049 ] 2050 schema_loader = CompositeSchemaLoader( 2051 schema_loaders=nested_schema_loaders, parameters={} 2052 ) 2053 elif model.schema_loader: 2054 schema_loader = self._create_component_from_model( 2055 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2056 config=config, 2057 ) 2058 else: 2059 options = model.parameters or {} 2060 if "name" not in options: 2061 options["name"] = model.name 2062 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2063 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2064 2065 stream_name = model.name or "" 2066 return DefaultStream( 2067 partition_generator=StreamSlicerPartitionGenerator( 2068 DeclarativePartitionFactory( 2069 stream_name, 2070 schema_loader, 2071 retriever, 2072 self._message_repository, 2073 ), 2074 stream_slicer, 2075 slice_limit=self._limit_slices_fetched, 2076 ), 2077 name=stream_name, 2078 json_schema=schema_loader.get_json_schema, 2079 
primary_key=get_primary_key_from_stream(primary_key), 2080 cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2081 if hasattr(concurrent_cursor, "cursor_field") 2082 else "", # FIXME we should have the cursor field as part of the interface of the cursor, 2083 logger=logging.getLogger(f"airbyte.{stream_name}"), 2084 cursor=concurrent_cursor, 2085 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2086 ) 2087 2088 def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: 2089 stream_name = model.name or "" 2090 stream_state = self._connector_state_manager.get_stream_state( 2091 stream_name=stream_name, namespace=None 2092 ) 2093 if model.state_migrations: 2094 state_transformations = [ 2095 self._create_component_from_model(state_migration, config, declarative_stream=model) 2096 for state_migration in model.state_migrations 2097 ] 2098 else: 2099 state_transformations = [] 2100 stream_state = self.apply_stream_state_migrations(state_transformations, stream_state) 2101 self._connector_state_manager.update_state_for_stream( 2102 stream_name=stream_name, namespace=None, value=stream_state 2103 ) 2104 2105 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2106 return bool( 2107 model.incremental_sync 2108 and hasattr(model.incremental_sync, "is_data_feed") 2109 and model.incremental_sync.is_data_feed 2110 ) 2111 2112 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2113 return bool( 2114 model.incremental_sync 2115 and hasattr(model.incremental_sync, "is_client_side_incremental") 2116 and model.incremental_sync.is_client_side_incremental 2117 ) 2118 2119 def _build_stream_slicer_from_partition_router( 2120 self, 2121 model: Union[ 2122 AsyncRetrieverModel, 2123 CustomRetrieverModel, 2124 SimpleRetrieverModel, 2125 ], 2126 config: Config, 2127 stream_name: Optional[str] = None, 2128 **kwargs: Any, 2129 ) -> PartitionRouter: 2130 if ( 2131 hasattr(model, "partition_router") 2132 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2133 and model.partition_router 2134 ): 2135 stream_slicer_model = model.partition_router 2136 if isinstance(stream_slicer_model, list): 2137 return CartesianProductStreamSlicer( 2138 [ 2139 self._create_component_from_model( 2140 model=slicer, config=config, stream_name=stream_name or "" 2141 ) 2142 for slicer in stream_slicer_model 2143 ], 2144 parameters={}, 2145 ) 2146 elif isinstance(stream_slicer_model, dict): 2147 # the partition router comes from a CustomRetrieverModel and therefore has not been parsed as a model 2148 params = stream_slicer_model.get("$parameters") 2149 if not isinstance(params, dict): 2150 params = {} 2151 stream_slicer_model["$parameters"] = params 2152 2153 if stream_name is not None: 2154 params["stream_name"] = stream_name 2155 2156 return self._create_nested_component( # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. 
If not, we expect an AttributeError during the call to `stream_slices` 2157 model, 2158 "partition_router", 2159 stream_slicer_model, 2160 config, 2161 **kwargs, 2162 ) 2163 else: 2164 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2165 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2166 ) 2167 return SinglePartitionRouter(parameters={}) 2168 2169 def _build_concurrent_cursor( 2170 self, 2171 model: DeclarativeStreamModel, 2172 stream_slicer: Optional[PartitionRouter], 2173 config: Config, 2174 ) -> Cursor: 2175 stream_name = model.name or "" 2176 stream_state = self._connector_state_manager.get_stream_state(stream_name, None) 2177 2178 if ( 2179 model.incremental_sync 2180 and stream_slicer 2181 and not isinstance(stream_slicer, SinglePartitionRouter) 2182 ): 2183 if isinstance(model.incremental_sync, IncrementingCountCursorModel): 2184 # We don't currently support usage of partition routing and IncrementingCountCursor at the 2185 # same time because we didn't solve for design questions like what the lookback window would 2186 # be as well as global cursor fall backs. We have not seen customers that have needed both 2187 # at the same time yet and are currently punting on this until we need to solve it. 2188 raise ValueError( 2189 f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}." 2190 ) 2191 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2192 state_manager=self._connector_state_manager, 2193 model_type=DatetimeBasedCursorModel, 2194 component_definition=model.incremental_sync.__dict__, 2195 stream_name=stream_name, 2196 stream_state=stream_state, 2197 stream_namespace=None, 2198 config=config or {}, 2199 partition_router=stream_slicer, 2200 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2201 ) 2202 elif model.incremental_sync: 2203 if type(model.incremental_sync) == IncrementingCountCursorModel: 2204 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2205 model_type=IncrementingCountCursorModel, 2206 component_definition=model.incremental_sync.__dict__, 2207 stream_name=stream_name, 2208 stream_namespace=None, 2209 stream_state=stream_state, 2210 config=config or {}, 2211 ) 2212 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2213 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2214 model_type=type(model.incremental_sync), 2215 component_definition=model.incremental_sync.__dict__, 2216 stream_name=stream_name, 2217 stream_namespace=None, 2218 stream_state=stream_state, 2219 config=config or {}, 2220 attempt_to_create_cursor_if_not_provided=True, 2221 ) 2222 else: 2223 raise ValueError( 2224 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2225 ) 2226 return FinalStateCursor(stream_name, None, self._message_repository) 2227 2228 def create_default_error_handler( 2229 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2230 ) -> DefaultErrorHandler: 2231 backoff_strategies = [] 2232 if model.backoff_strategies: 2233 for backoff_strategy_model in model.backoff_strategies: 2234 backoff_strategies.append( 2235 self._create_component_from_model(model=backoff_strategy_model, config=config) 2236 ) 2237 2238 response_filters = [] 2239 if model.response_filters: 2240 for response_filter_model in model.response_filters: 2241 response_filters.append( 2242 self._create_component_from_model(model=response_filter_model, config=config) 2243 ) 2244 response_filters.append( 2245 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2246 ) 2247 2248 return DefaultErrorHandler( 2249 backoff_strategies=backoff_strategies, 2250 max_retries=model.max_retries, 2251 response_filters=response_filters, 2252 config=config, 2253 parameters=model.parameters or {}, 2254 ) 2255 2256 def create_default_paginator( 2257 self, 2258 model: DefaultPaginatorModel, 2259 config: Config, 2260 *, 2261 url_base: str, 2262 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2263 decoder: Optional[Decoder] = None, 2264 cursor_used_for_stop_condition: Optional[Cursor] = None, 2265 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2266 if decoder: 2267 if self._is_supported_decoder_for_pagination(decoder): 2268 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2269 else: 2270 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2271 else: 2272 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2273 page_size_option = ( 2274 self._create_component_from_model(model=model.page_size_option, config=config) 2275 if model.page_size_option 2276 else None 2277 ) 2278 page_token_option = ( 2279 self._create_component_from_model(model=model.page_token_option, config=config) 2280 if model.page_token_option 2281 else None 2282 ) 2283 pagination_strategy = self._create_component_from_model( 2284 model=model.pagination_strategy, 2285 config=config, 2286 decoder=decoder_to_use, 2287 extractor_model=extractor_model, 2288 ) 2289 if cursor_used_for_stop_condition: 2290 pagination_strategy = StopConditionPaginationStrategyDecorator( 2291 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2292 ) 2293 paginator = DefaultPaginator( 2294 decoder=decoder_to_use, 2295 page_size_option=page_size_option, 2296 page_token_option=page_token_option, 2297 pagination_strategy=pagination_strategy, 2298 url_base=url_base, 2299 config=config, 2300 parameters=model.parameters or {}, 2301 ) 2302 if self._limit_pages_fetched_per_slice: 2303 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2304 return paginator 2305 2306 def create_dpath_extractor( 2307 self, 2308 model: 
DpathExtractorModel, 2309 config: Config, 2310 decoder: Optional[Decoder] = None, 2311 **kwargs: Any, 2312 ) -> DpathExtractor: 2313 if decoder: 2314 decoder_to_use = decoder 2315 else: 2316 decoder_to_use = JsonDecoder(parameters={}) 2317 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2318 return DpathExtractor( 2319 decoder=decoder_to_use, 2320 field_path=model_field_path, 2321 config=config, 2322 parameters=model.parameters or {}, 2323 ) 2324 2325 @staticmethod 2326 def create_response_to_file_extractor( 2327 model: ResponseToFileExtractorModel, 2328 **kwargs: Any, 2329 ) -> ResponseToFileExtractor: 2330 return ResponseToFileExtractor(parameters=model.parameters or {}) 2331 2332 @staticmethod 2333 def create_exponential_backoff_strategy( 2334 model: ExponentialBackoffStrategyModel, config: Config 2335 ) -> ExponentialBackoffStrategy: 2336 return ExponentialBackoffStrategy( 2337 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2338 ) 2339 2340 @staticmethod 2341 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2342 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2343 2344 def create_http_requester( 2345 self, 2346 model: HttpRequesterModel, 2347 config: Config, 2348 decoder: Decoder = JsonDecoder(parameters={}), 2349 query_properties_key: Optional[str] = None, 2350 use_cache: Optional[bool] = None, 2351 *, 2352 name: str, 2353 ) -> HttpRequester: 2354 authenticator = ( 2355 self._create_component_from_model( 2356 model=model.authenticator, 2357 config=config, 2358 url_base=model.url or model.url_base, 2359 name=name, 2360 decoder=decoder, 2361 ) 2362 if model.authenticator 2363 else None 2364 ) 2365 error_handler = ( 2366 self._create_component_from_model(model=model.error_handler, config=config) 2367 if model.error_handler 2368 else DefaultErrorHandler( 2369 backoff_strategies=[], 2370 response_filters=[], 2371 config=config, 2372 parameters=model.parameters or {}, 2373 ) 2374 ) 2375 2376 api_budget = self._api_budget 2377 2378 request_options_provider = InterpolatedRequestOptionsProvider( 2379 request_body=model.request_body, 2380 request_body_data=model.request_body_data, 2381 request_body_json=model.request_body_json, 2382 request_headers=model.request_headers, 2383 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2384 query_properties_key=query_properties_key, 2385 config=config, 2386 parameters=model.parameters or {}, 2387 ) 2388 2389 assert model.use_cache is not None # for mypy 2390 assert model.http_method is not None # for mypy 2391 2392 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2393 2394 return HttpRequester( 2395 name=name, 2396 url=model.url, 2397 url_base=model.url_base, 2398 path=model.path, 2399 authenticator=authenticator, 2400 error_handler=error_handler, 2401 api_budget=api_budget, 2402 http_method=HttpMethod[model.http_method.value], 2403 request_options_provider=request_options_provider, 2404 config=config, 2405 disable_retries=self._disable_retries, 2406 parameters=model.parameters or {}, 2407 message_repository=self._message_repository, 2408 use_cache=should_use_cache, 2409 decoder=decoder, 2410 stream_response=decoder.is_stream_response() if decoder else False, 2411 ) 2412 2413 @staticmethod 2414 def create_http_response_filter( 2415 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2416 ) -> 
HttpResponseFilter: 2417 if model.action: 2418 action = ResponseAction(model.action.value) 2419 else: 2420 action = None 2421 2422 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2423 2424 http_codes = ( 2425 set(model.http_codes) if model.http_codes else set() 2426 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2427 2428 return HttpResponseFilter( 2429 action=action, 2430 failure_type=failure_type, 2431 error_message=model.error_message or "", 2432 error_message_contains=model.error_message_contains or "", 2433 http_codes=http_codes, 2434 predicate=model.predicate or "", 2435 config=config, 2436 parameters=model.parameters or {}, 2437 ) 2438 2439 @staticmethod 2440 def create_inline_schema_loader( 2441 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2442 ) -> InlineSchemaLoader: 2443 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2444 2445 def create_complex_field_type( 2446 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2447 ) -> ComplexFieldType: 2448 items = ( 2449 self._create_component_from_model(model=model.items, config=config) 2450 if isinstance(model.items, ComplexFieldTypeModel) 2451 else model.items 2452 ) 2453 2454 return ComplexFieldType(field_type=model.field_type, items=items) 2455 2456 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2457 target_type = ( 2458 self._create_component_from_model(model=model.target_type, config=config) 2459 if isinstance(model.target_type, ComplexFieldTypeModel) 2460 else model.target_type 2461 ) 2462 2463 return TypesMap( 2464 target_type=target_type, 2465 current_type=model.current_type, 2466 condition=model.condition if model.condition is not None else "True", 2467 ) 2468 2469 def create_schema_type_identifier( 2470 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2471 ) -> SchemaTypeIdentifier: 2472 types_mapping = [] 2473 if model.types_mapping: 2474 types_mapping.extend( 2475 [ 2476 self._create_component_from_model(types_map, config=config) 2477 for types_map in model.types_mapping 2478 ] 2479 ) 2480 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2481 [x for x in model.schema_pointer] if model.schema_pointer else [] 2482 ) 2483 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2484 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2485 [x for x in model.type_pointer] if model.type_pointer else None 2486 ) 2487 2488 return SchemaTypeIdentifier( 2489 schema_pointer=model_schema_pointer, 2490 key_pointer=model_key_pointer, 2491 type_pointer=model_type_pointer, 2492 types_mapping=types_mapping, 2493 parameters=model.parameters or {}, 2494 ) 2495 2496 def create_dynamic_schema_loader( 2497 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2498 ) -> DynamicSchemaLoader: 2499 schema_transformations = [] 2500 if model.schema_transformations: 2501 for transformation_model in model.schema_transformations: 2502 schema_transformations.append( 2503 self._create_component_from_model(model=transformation_model, config=config) 2504 ) 2505 name = "dynamic_properties" 2506 retriever = self._create_component_from_model( 2507 model=model.retriever, 2508 config=config, 2509 name=name, 2510 primary_key=None, 2511 partition_router=self._build_stream_slicer_from_partition_router( 2512 model.retriever, config 2513 ), 2514 transformations=[], 2515 use_cache=True, 2516 
log_formatter=( 2517 lambda response: format_http_message( 2518 response, 2519 f"Schema loader '{name}' request", 2520 f"Request performed in order to extract schema.", 2521 name, 2522 is_auxiliary=True, 2523 ) 2524 ), 2525 ) 2526 schema_type_identifier = self._create_component_from_model( 2527 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2528 ) 2529 schema_filter = ( 2530 self._create_component_from_model( 2531 model.schema_filter, config=config, parameters=model.parameters or {} 2532 ) 2533 if model.schema_filter is not None 2534 else None 2535 ) 2536 2537 return DynamicSchemaLoader( 2538 retriever=retriever, 2539 config=config, 2540 schema_transformations=schema_transformations, 2541 schema_filter=schema_filter, 2542 schema_type_identifier=schema_type_identifier, 2543 parameters=model.parameters or {}, 2544 ) 2545 2546 @staticmethod 2547 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2548 return JsonDecoder(parameters={}) 2549 2550 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2551 return CompositeRawDecoder( 2552 parser=ModelToComponentFactory._get_parser(model, config), 2553 stream_response=False if self._emit_connector_builder_messages else True, 2554 ) 2555 2556 def create_jsonl_decoder( 2557 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2558 ) -> Decoder: 2559 return CompositeRawDecoder( 2560 parser=ModelToComponentFactory._get_parser(model, config), 2561 stream_response=False if self._emit_connector_builder_messages else True, 2562 ) 2563 2564 def create_gzip_decoder( 2565 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2566 ) -> Decoder: 2567 _compressed_response_types = { 2568 "gzip", 2569 "x-gzip", 2570 "gzip, deflate", 2571 "x-gzip, deflate", 2572 "application/zip", 2573 "application/gzip", 2574 "application/x-gzip", 2575 "application/x-zip-compressed", 2576 } 2577 2578 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2579 2580 if self._emit_connector_builder_messages: 2581 # This is very surprising but if the response is not streamed, 2582 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2583 # which uses urllib3 directly and does not uncompress the data. 
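# A rough illustration of that difference (illustrative only, assuming a gzip Content-Encoding on the response):
# reading response.content lets the requests library transparently decompress the payload, so the inner parser
# (e.g. a JSON parser) is enough, whereas response.raw exposes the urllib3 stream which stays compressed unless
# it is read with decode_content=True, which is why the streamed path below still needs the GzipParser.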
2584 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2585 2586 return CompositeRawDecoder.by_headers( 2587 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2588 stream_response=True, 2589 fallback_parser=gzip_parser.inner_parser, 2590 ) 2591 2592 @staticmethod 2593 def create_iterable_decoder( 2594 model: IterableDecoderModel, config: Config, **kwargs: Any 2595 ) -> IterableDecoder: 2596 return IterableDecoder(parameters={}) 2597 2598 @staticmethod 2599 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2600 return XmlDecoder(parameters={}) 2601 2602 def create_zipfile_decoder( 2603 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2604 ) -> ZipfileDecoder: 2605 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2606 2607 @staticmethod 2608 def _get_parser(model: BaseModel, config: Config) -> Parser: 2609 if isinstance(model, JsonDecoderModel): 2610 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2611 return JsonParser() 2612 elif isinstance(model, JsonlDecoderModel): 2613 return JsonLineParser() 2614 elif isinstance(model, CsvDecoderModel): 2615 return CsvParser( 2616 encoding=model.encoding, 2617 delimiter=model.delimiter, 2618 set_values_to_none=model.set_values_to_none, 2619 ) 2620 elif isinstance(model, GzipDecoderModel): 2621 return GzipParser( 2622 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2623 ) 2624 elif isinstance( 2625 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2626 ): 2627 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2628 2629 raise ValueError(f"Unknown decoder type {model}") 2630 2631 @staticmethod 2632 def create_json_file_schema_loader( 2633 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2634 ) -> JsonFileSchemaLoader: 2635 return JsonFileSchemaLoader( 2636 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2637 ) 2638 2639 def create_jwt_authenticator( 2640 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2641 ) -> JwtAuthenticator: 2642 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2643 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2644 request_option = ( 2645 self._create_component_from_model(model.request_option, config) 2646 if model.request_option 2647 else None 2648 ) 2649 return JwtAuthenticator( 2650 config=config, 2651 parameters=model.parameters or {}, 2652 algorithm=JwtAlgorithm(model.algorithm.value), 2653 secret_key=model.secret_key, 2654 base64_encode_secret_key=model.base64_encode_secret_key, 2655 token_duration=model.token_duration, 2656 header_prefix=model.header_prefix, 2657 kid=jwt_headers.kid, 2658 typ=jwt_headers.typ, 2659 cty=jwt_headers.cty, 2660 iss=jwt_payload.iss, 2661 sub=jwt_payload.sub, 2662 aud=jwt_payload.aud, 2663 additional_jwt_headers=model.additional_jwt_headers, 2664 additional_jwt_payload=model.additional_jwt_payload, 2665 passphrase=model.passphrase, 2666 request_option=request_option, 2667 ) 2668 2669 def create_list_partition_router( 2670 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2671 ) -> ListPartitionRouter: 2672 request_option = ( 2673 self._create_component_from_model(model.request_option, config) 2674 if model.request_option 2675 else None 
2676 ) 2677 return ListPartitionRouter( 2678 cursor_field=model.cursor_field, 2679 request_option=request_option, 2680 values=model.values, 2681 config=config, 2682 parameters=model.parameters or {}, 2683 ) 2684 2685 @staticmethod 2686 def create_min_max_datetime( 2687 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2688 ) -> MinMaxDatetime: 2689 return MinMaxDatetime( 2690 datetime=model.datetime, 2691 datetime_format=model.datetime_format or "", 2692 max_datetime=model.max_datetime or "", 2693 min_datetime=model.min_datetime or "", 2694 parameters=model.parameters or {}, 2695 ) 2696 2697 @staticmethod 2698 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2699 return NoAuth(parameters=model.parameters or {}) 2700 2701 @staticmethod 2702 def create_no_pagination( 2703 model: NoPaginationModel, config: Config, **kwargs: Any 2704 ) -> NoPagination: 2705 return NoPagination(parameters={}) 2706 2707 def create_oauth_authenticator( 2708 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2709 ) -> DeclarativeOauth2Authenticator: 2710 profile_assertion = ( 2711 self._create_component_from_model(model.profile_assertion, config=config) 2712 if model.profile_assertion 2713 else None 2714 ) 2715 2716 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = ( 2717 self._get_refresh_token_error_information(model) 2718 ) 2719 if model.refresh_token_updater: 2720 # ignore type error because fixing it would have a lot of dependencies, revisit later 2721 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2722 config, 2723 InterpolatedString.create( 2724 model.token_refresh_endpoint, # type: ignore 2725 parameters=model.parameters or {}, 2726 ).eval(config), 2727 access_token_name=InterpolatedString.create( 2728 model.access_token_name or "access_token", parameters=model.parameters or {} 2729 ).eval(config), 2730 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2731 expires_in_name=InterpolatedString.create( 2732 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2733 ).eval(config), 2734 client_id_name=InterpolatedString.create( 2735 model.client_id_name or "client_id", parameters=model.parameters or {} 2736 ).eval(config), 2737 client_id=InterpolatedString.create( 2738 model.client_id, parameters=model.parameters or {} 2739 ).eval(config) 2740 if model.client_id 2741 else model.client_id, 2742 client_secret_name=InterpolatedString.create( 2743 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2744 ).eval(config), 2745 client_secret=InterpolatedString.create( 2746 model.client_secret, parameters=model.parameters or {} 2747 ).eval(config) 2748 if model.client_secret 2749 else model.client_secret, 2750 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2751 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2752 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2753 grant_type_name=InterpolatedString.create( 2754 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2755 ).eval(config), 2756 grant_type=InterpolatedString.create( 2757 model.grant_type or "refresh_token", parameters=model.parameters or {} 2758 ).eval(config), 2759 refresh_request_body=InterpolatedMapping( 2760 model.refresh_request_body or {}, parameters=model.parameters or {} 2761 ).eval(config), 2762 refresh_request_headers=InterpolatedMapping( 
2763 model.refresh_request_headers or {}, parameters=model.parameters or {} 2764 ).eval(config), 2765 scopes=model.scopes, 2766 token_expiry_date_format=model.token_expiry_date_format, 2767 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2768 message_repository=self._message_repository, 2769 refresh_token_error_status_codes=refresh_token_error_status_codes, 2770 refresh_token_error_key=refresh_token_error_key, 2771 refresh_token_error_values=refresh_token_error_values, 2772 ) 2773 # ignore type error because fixing it would have a lot of dependencies, revisit later 2774 return DeclarativeOauth2Authenticator( # type: ignore 2775 access_token_name=model.access_token_name or "access_token", 2776 access_token_value=model.access_token_value, 2777 client_id_name=model.client_id_name or "client_id", 2778 client_id=model.client_id, 2779 client_secret_name=model.client_secret_name or "client_secret", 2780 client_secret=model.client_secret, 2781 expires_in_name=model.expires_in_name or "expires_in", 2782 grant_type_name=model.grant_type_name or "grant_type", 2783 grant_type=model.grant_type or "refresh_token", 2784 refresh_request_body=model.refresh_request_body, 2785 refresh_request_headers=model.refresh_request_headers, 2786 refresh_token_name=model.refresh_token_name or "refresh_token", 2787 refresh_token=model.refresh_token, 2788 scopes=model.scopes, 2789 token_expiry_date=model.token_expiry_date, 2790 token_expiry_date_format=model.token_expiry_date_format, 2791 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2792 token_refresh_endpoint=model.token_refresh_endpoint, 2793 config=config, 2794 parameters=model.parameters or {}, 2795 message_repository=self._message_repository, 2796 profile_assertion=profile_assertion, 2797 use_profile_assertion=model.use_profile_assertion, 2798 refresh_token_error_status_codes=refresh_token_error_status_codes, 2799 refresh_token_error_key=refresh_token_error_key, 2800 refresh_token_error_values=refresh_token_error_values, 2801 ) 2802 2803 @staticmethod 2804 def _get_refresh_token_error_information( 2805 model: OAuthAuthenticatorModel, 2806 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2807 """ 2808 In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was 2809 defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the 2810 information is defined only once and return the right fields. 
2811 """ 2812 refresh_token_updater = model.refresh_token_updater 2813 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2814 refresh_token_updater.refresh_token_error_status_codes 2815 or refresh_token_updater.refresh_token_error_key 2816 or refresh_token_updater.refresh_token_error_values 2817 ) 2818 is_defined_on_oauth_authenticator = ( 2819 model.refresh_token_error_status_codes 2820 or model.refresh_token_error_key 2821 or model.refresh_token_error_values 2822 ) 2823 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2824 raise ValueError( 2825 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2826 ) 2827 2828 if is_defined_on_refresh_token_updated: 2829 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2830 return ( 2831 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2832 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2833 else (), 2834 not_optional_refresh_token_updater.refresh_token_error_key or "", 2835 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2836 if not_optional_refresh_token_updater.refresh_token_error_values 2837 else (), 2838 ) 2839 elif is_defined_on_oauth_authenticator: 2840 return ( 2841 tuple(model.refresh_token_error_status_codes) 2842 if model.refresh_token_error_status_codes 2843 else (), 2844 model.refresh_token_error_key or "", 2845 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2846 ) 2847 2848 # returning default values we think cover most cases 2849 return (400,), "error", ("invalid_grant", "invalid_permissions") 2850 2851 def create_offset_increment( 2852 self, 2853 model: OffsetIncrementModel, 2854 config: Config, 2855 decoder: Decoder, 2856 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2857 **kwargs: Any, 2858 ) -> OffsetIncrement: 2859 if isinstance(decoder, PaginationDecoderDecorator): 2860 inner_decoder = decoder.decoder 2861 else: 2862 inner_decoder = decoder 2863 decoder = PaginationDecoderDecorator(decoder=decoder) 2864 2865 if self._is_supported_decoder_for_pagination(inner_decoder): 2866 decoder_to_use = decoder 2867 else: 2868 raise ValueError( 2869 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2870 ) 2871 2872 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2873 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2874 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2875 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2876 # When we have more time to investigate we can look into reusing the same component. 
2877 extractor = ( 2878 self._create_component_from_model( 2879 model=extractor_model, config=config, decoder=decoder_to_use 2880 ) 2881 if extractor_model 2882 else None 2883 ) 2884 2885 return OffsetIncrement( 2886 page_size=model.page_size, 2887 config=config, 2888 decoder=decoder_to_use, 2889 extractor=extractor, 2890 inject_on_first_request=model.inject_on_first_request or False, 2891 parameters=model.parameters or {}, 2892 ) 2893 2894 @staticmethod 2895 def create_page_increment( 2896 model: PageIncrementModel, config: Config, **kwargs: Any 2897 ) -> PageIncrement: 2898 return PageIncrement( 2899 page_size=model.page_size, 2900 config=config, 2901 start_from_page=model.start_from_page or 0, 2902 inject_on_first_request=model.inject_on_first_request or False, 2903 parameters=model.parameters or {}, 2904 ) 2905 2906 def create_parent_stream_config( 2907 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2908 ) -> ParentStreamConfig: 2909 declarative_stream = self._create_component_from_model( 2910 model.stream, 2911 config=config, 2912 is_parent=True, 2913 **kwargs, 2914 ) 2915 request_option = ( 2916 self._create_component_from_model(model.request_option, config=config) 2917 if model.request_option 2918 else None 2919 ) 2920 2921 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2922 raise ValueError( 2923 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2924 ) 2925 2926 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2927 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2928 ) 2929 2930 return ParentStreamConfig( 2931 parent_key=model.parent_key, 2932 request_option=request_option, 2933 stream=declarative_stream, 2934 partition_field=model.partition_field, 2935 config=config, 2936 incremental_dependency=model.incremental_dependency or False, 2937 parameters=model.parameters or {}, 2938 extra_fields=model.extra_fields, 2939 lazy_read_pointer=model_lazy_read_pointer, 2940 ) 2941 2942 def create_properties_from_endpoint( 2943 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2944 ) -> PropertiesFromEndpoint: 2945 retriever = self._create_component_from_model( 2946 model=model.retriever, 2947 config=config, 2948 name="dynamic_properties", 2949 primary_key=None, 2950 stream_slicer=None, 2951 transformations=[], 2952 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ between slices 2953 ) 2954 return PropertiesFromEndpoint( 2955 property_field_path=model.property_field_path, 2956 retriever=retriever, 2957 config=config, 2958 parameters=model.parameters or {}, 2959 ) 2960 2961 def create_property_chunking( 2962 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2963 ) -> PropertyChunking: 2964 record_merge_strategy = ( 2965 self._create_component_from_model( 2966 model=model.record_merge_strategy, config=config, **kwargs 2967 ) 2968 if model.record_merge_strategy 2969 else None 2970 ) 2971 2972 property_limit_type: PropertyLimitType 2973 match model.property_limit_type: 2974 case PropertyLimitTypeModel.property_count: 2975 property_limit_type = PropertyLimitType.property_count 2976 case PropertyLimitTypeModel.characters: 2977 property_limit_type = PropertyLimitType.characters 2978 case _: 2979 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 
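# A minimal manifest sketch for the component built here (field values are hypothetical, shown only to
# illustrate how the model maps onto PropertyChunking):
#   property_chunking:
#     type: PropertyChunking
#     property_limit_type: characters
#     property_limit: 15000
#     record_merge_strategy:
#       type: GroupByKeyMergeStrategy
#       key: ["id"]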
2980 2981 return PropertyChunking( 2982 property_limit_type=property_limit_type, 2983 property_limit=model.property_limit, 2984 record_merge_strategy=record_merge_strategy, 2985 config=config, 2986 parameters=model.parameters or {}, 2987 ) 2988 2989 def create_query_properties( 2990 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 2991 ) -> QueryProperties: 2992 if isinstance(model.property_list, list): 2993 property_list = model.property_list 2994 else: 2995 property_list = self._create_component_from_model( 2996 model=model.property_list, config=config, **kwargs 2997 ) 2998 2999 property_chunking = ( 3000 self._create_component_from_model( 3001 model=model.property_chunking, config=config, **kwargs 3002 ) 3003 if model.property_chunking 3004 else None 3005 ) 3006 3007 property_selector = ( 3008 self._create_component_from_model( 3009 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3010 ) 3011 if model.property_selector 3012 else None 3013 ) 3014 3015 return QueryProperties( 3016 property_list=property_list, 3017 always_include_properties=model.always_include_properties, 3018 property_chunking=property_chunking, 3019 property_selector=property_selector, 3020 config=config, 3021 parameters=model.parameters or {}, 3022 ) 3023 3024 def create_json_schema_property_selector( 3025 self, 3026 model: JsonSchemaPropertySelectorModel, 3027 config: Config, 3028 *, 3029 stream_name: str, 3030 **kwargs: Any, 3031 ) -> JsonSchemaPropertySelector: 3032 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3033 3034 transformations = [] 3035 if model.transformations: 3036 for transformation_model in model.transformations: 3037 transformations.append( 3038 self._create_component_from_model(model=transformation_model, config=config) 3039 ) 3040 3041 return JsonSchemaPropertySelector( 3042 configured_stream=configured_stream, 3043 properties_transformations=transformations, 3044 config=config, 3045 parameters=model.parameters or {}, 3046 ) 3047 3048 @staticmethod 3049 def create_record_filter( 3050 model: RecordFilterModel, config: Config, **kwargs: Any 3051 ) -> RecordFilter: 3052 return RecordFilter( 3053 condition=model.condition or "", config=config, parameters=model.parameters or {} 3054 ) 3055 3056 @staticmethod 3057 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3058 return RequestPath(parameters={}) 3059 3060 @staticmethod 3061 def create_request_option( 3062 model: RequestOptionModel, config: Config, **kwargs: Any 3063 ) -> RequestOption: 3064 inject_into = RequestOptionType(model.inject_into.value) 3065 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3066 [ 3067 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3068 for segment in model.field_path 3069 ] 3070 if model.field_path 3071 else None 3072 ) 3073 field_name = ( 3074 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3075 if model.field_name 3076 else None 3077 ) 3078 return RequestOption( 3079 field_name=field_name, 3080 field_path=field_path, 3081 inject_into=inject_into, 3082 parameters=kwargs.get("parameters", {}), 3083 ) 3084 3085 def create_record_selector( 3086 self, 3087 model: RecordSelectorModel, 3088 config: Config, 3089 *, 3090 name: str, 3091 transformations: List[RecordTransformation] | None = None, 3092 decoder: Decoder | None = None, 3093 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3094 
file_uploader: Optional[DefaultFileUploader] = None, 3095 **kwargs: Any, 3096 ) -> RecordSelector: 3097 extractor = self._create_component_from_model( 3098 model=model.extractor, decoder=decoder, config=config 3099 ) 3100 record_filter = ( 3101 self._create_component_from_model(model.record_filter, config=config) 3102 if model.record_filter 3103 else None 3104 ) 3105 3106 transform_before_filtering = ( 3107 False if model.transform_before_filtering is None else model.transform_before_filtering 3108 ) 3109 if client_side_incremental_sync_cursor: 3110 record_filter = ClientSideIncrementalRecordFilterDecorator( 3111 config=config, 3112 parameters=model.parameters, 3113 condition=model.record_filter.condition 3114 if (model.record_filter and hasattr(model.record_filter, "condition")) 3115 else None, 3116 cursor=client_side_incremental_sync_cursor, 3117 ) 3118 transform_before_filtering = ( 3119 True 3120 if model.transform_before_filtering is None 3121 else model.transform_before_filtering 3122 ) 3123 3124 if model.schema_normalization is None: 3125 # default to no schema normalization if not set 3126 model.schema_normalization = SchemaNormalizationModel.None_ 3127 3128 schema_normalization = ( 3129 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3130 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3131 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3132 ) 3133 3134 return RecordSelector( 3135 extractor=extractor, 3136 name=name, 3137 config=config, 3138 record_filter=record_filter, 3139 transformations=transformations or [], 3140 file_uploader=file_uploader, 3141 schema_normalization=schema_normalization, 3142 parameters=model.parameters or {}, 3143 transform_before_filtering=transform_before_filtering, 3144 ) 3145 3146 @staticmethod 3147 def create_remove_fields( 3148 model: RemoveFieldsModel, config: Config, **kwargs: Any 3149 ) -> RemoveFields: 3150 return RemoveFields( 3151 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3152 ) 3153 3154 def create_selective_authenticator( 3155 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3156 ) -> DeclarativeAuthenticator: 3157 authenticators = { 3158 name: self._create_component_from_model(model=auth, config=config) 3159 for name, auth in model.authenticators.items() 3160 } 3161 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3162 return SelectiveAuthenticator( # type: ignore[abstract] 3163 config=config, 3164 authenticators=authenticators, 3165 authenticator_selection_path=model.authenticator_selection_path, 3166 **kwargs, 3167 ) 3168 3169 @staticmethod 3170 def create_legacy_session_token_authenticator( 3171 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3172 ) -> LegacySessionTokenAuthenticator: 3173 return LegacySessionTokenAuthenticator( 3174 api_url=url_base, 3175 header=model.header, 3176 login_url=model.login_url, 3177 password=model.password or "", 3178 session_token=model.session_token or "", 3179 session_token_response_key=model.session_token_response_key or "", 3180 username=model.username or "", 3181 validate_session_url=model.validate_session_url, 3182 config=config, 3183 parameters=model.parameters or {}, 3184 ) 3185 3186 def create_simple_retriever( 3187 self, 3188 model: SimpleRetrieverModel, 3189 config: Config, 3190 *, 3191 
name: str, 3192 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3193 request_options_provider: Optional[RequestOptionsProvider] = None, 3194 cursor: Optional[Cursor] = None, 3195 has_stop_condition_cursor: bool = False, 3196 is_client_side_incremental_sync: bool = False, 3197 transformations: List[RecordTransformation], 3198 file_uploader: Optional[DefaultFileUploader] = None, 3199 incremental_sync: Optional[ 3200 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3201 ] = None, 3202 use_cache: Optional[bool] = None, 3203 log_formatter: Optional[Callable[[Response], Any]] = None, 3204 partition_router: Optional[PartitionRouter] = None, 3205 **kwargs: Any, 3206 ) -> SimpleRetriever: 3207 def _get_url(req: Requester) -> str: 3208 """ 3209 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3210 This is needed because the URL is not set until the requester is created. 3211 """ 3212 3213 _url: str = ( 3214 model.requester.url 3215 if hasattr(model.requester, "url") and model.requester.url is not None 3216 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3217 ) 3218 _url_base: str = ( 3219 model.requester.url_base 3220 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3221 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3222 ) 3223 3224 return _url or _url_base 3225 3226 if cursor is None: 3227 cursor = FinalStateCursor(name, None, self._message_repository) 3228 3229 decoder = ( 3230 self._create_component_from_model(model=model.decoder, config=config) 3231 if model.decoder 3232 else JsonDecoder(parameters={}) 3233 ) 3234 record_selector = self._create_component_from_model( 3235 model=model.record_selector, 3236 name=name, 3237 config=config, 3238 decoder=decoder, 3239 transformations=transformations, 3240 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3241 file_uploader=file_uploader, 3242 ) 3243 3244 query_properties: Optional[QueryProperties] = None 3245 query_properties_key: Optional[str] = None 3246 self._ensure_query_properties_to_model(model.requester) 3247 if self._has_query_properties_in_request_parameters(model.requester): 3248 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3249 # places instead of default to request_parameters which isn't clearly documented 3250 if ( 3251 hasattr(model.requester, "fetch_properties_from_endpoint") 3252 and model.requester.fetch_properties_from_endpoint 3253 ): 3254 raise ValueError( 3255 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3256 ) 3257 3258 query_properties_definitions = [] 3259 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3260 if isinstance(request_parameter, QueryPropertiesModel): 3261 query_properties_key = key 3262 query_properties_definitions.append(request_parameter) 3263 3264 if len(query_properties_definitions) > 1: 3265 raise ValueError( 3266 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3267 ) 3268 3269 if len(query_properties_definitions) == 1: 3270 query_properties = self._create_component_from_model( 3271 
model=query_properties_definitions[0], stream_name=name, config=config 3272 ) 3273 3274 # Removes QueryProperties components from the interpolated mappings because it has been designed 3275 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 3276 # instead of through jinja interpolation 3277 if hasattr(model.requester, "request_parameters") and isinstance( 3278 model.requester.request_parameters, Mapping 3279 ): 3280 model.requester.request_parameters = self._remove_query_properties( 3281 model.requester.request_parameters 3282 ) 3283 elif ( 3284 hasattr(model.requester, "fetch_properties_from_endpoint") 3285 and model.requester.fetch_properties_from_endpoint 3286 ): 3287 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3288 query_properties_definition = QueryPropertiesModel( 3289 type="QueryProperties", 3290 property_list=model.requester.fetch_properties_from_endpoint, 3291 always_include_properties=None, 3292 property_chunking=None, 3293 ) # type: ignore # $parameters has a default value 3294 3295 query_properties = self.create_query_properties( 3296 model=query_properties_definition, 3297 stream_name=name, 3298 config=config, 3299 ) 3300 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3301 query_properties = self.create_query_properties( 3302 model=model.requester.query_properties, 3303 stream_name=name, 3304 config=config, 3305 ) 3306 3307 requester = self._create_component_from_model( 3308 model=model.requester, 3309 decoder=decoder, 3310 name=name, 3311 query_properties_key=query_properties_key, 3312 use_cache=use_cache, 3313 config=config, 3314 ) 3315 3316 if not request_options_provider: 3317 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3318 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3319 partition_router, PartitionRouter 3320 ): 3321 request_options_provider = partition_router 3322 3323 paginator = ( 3324 self._create_component_from_model( 3325 model=model.paginator, 3326 config=config, 3327 url_base=_get_url(requester), 3328 extractor_model=model.record_selector.extractor, 3329 decoder=decoder, 3330 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3331 ) 3332 if model.paginator 3333 else NoPagination(parameters={}) 3334 ) 3335 3336 ignore_stream_slicer_parameters_on_paginated_requests = ( 3337 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3338 ) 3339 3340 if ( 3341 model.partition_router 3342 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3343 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3344 and any( 3345 parent_stream_config.lazy_read_pointer 3346 for parent_stream_config in model.partition_router.parent_stream_configs 3347 ) 3348 ): 3349 if incremental_sync: 3350 if incremental_sync.type != "DatetimeBasedCursor": 3351 raise ValueError( 3352 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3353 ) 3354 3355 elif incremental_sync.step or incremental_sync.cursor_granularity: 3356 raise ValueError( 3357 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3358 ) 3359 3360 if model.decoder and model.decoder.type != "JsonDecoder": 3361 raise ValueError( 3362 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 
3363 ) 3364 3365 return LazySimpleRetriever( 3366 name=name, 3367 paginator=paginator, 3368 primary_key=primary_key, 3369 requester=requester, 3370 record_selector=record_selector, 3371 stream_slicer=_NO_STREAM_SLICING, 3372 request_option_provider=request_options_provider, 3373 config=config, 3374 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3375 parameters=model.parameters or {}, 3376 ) 3377 3378 if ( 3379 model.record_selector.record_filter 3380 and model.pagination_reset 3381 and model.pagination_reset.limits 3382 ): 3383 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3384 3385 return SimpleRetriever( 3386 name=name, 3387 paginator=paginator, 3388 primary_key=primary_key, 3389 requester=requester, 3390 record_selector=record_selector, 3391 stream_slicer=_NO_STREAM_SLICING, 3392 request_option_provider=request_options_provider, 3393 config=config, 3394 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3395 additional_query_properties=query_properties, 3396 log_formatter=self._get_log_formatter(log_formatter, name), 3397 pagination_tracker_factory=self._create_pagination_tracker_factory( 3398 model.pagination_reset, cursor 3399 ), 3400 parameters=model.parameters or {}, 3401 ) 3402 3403 def _create_pagination_tracker_factory( 3404 self, model: Optional[PaginationResetModel], cursor: Cursor 3405 ) -> Callable[[], PaginationTracker]: 3406 if model is None: 3407 return lambda: PaginationTracker() 3408 3409 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3410 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3411 if model.action == PaginationResetActionModel.RESET: 3412 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3413 pass 3414 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3415 if isinstance(cursor, ConcurrentCursor): 3416 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3417 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3418 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3419 {}, datetime.timedelta(0) 3420 ) 3421 elif not isinstance(cursor, FinalStateCursor): 3422 LOGGER.warning( 3423 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3424 ) 3425 else: 3426 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3427 3428 limit = model.limits.number_of_records if model and model.limits else None 3429 return lambda: PaginationTracker(cursor_factory(), limit) 3430 3431 def _get_log_formatter( 3432 self, log_formatter: Callable[[Response], Any] | None, name: str 3433 ) -> Callable[[Response], Any] | None: 3434 if self._should_limit_slices_fetched(): 3435 return ( 3436 ( 3437 lambda response: format_http_message( 3438 response, 3439 f"Stream '{name}' request", 3440 f"Request performed in order to extract records for stream '{name}'", 3441 name, 3442 ) 3443 ) 3444 if not log_formatter 3445 else log_formatter 3446 ) 3447 return None 3448 3449 def _should_limit_slices_fetched(self) -> bool: 3450 """ 3451 Returns True if the number of slices fetched should be limited, False otherwise. 
3452 This is used to limit the number of slices fetched during tests. 3453 """ 3454 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3455 3456 @staticmethod 3457 def _has_query_properties_in_request_parameters( 3458 requester: Union[HttpRequesterModel, CustomRequesterModel], 3459 ) -> bool: 3460 if not hasattr(requester, "request_parameters"): 3461 return False 3462 request_parameters = requester.request_parameters 3463 if request_parameters and isinstance(request_parameters, Mapping): 3464 for request_parameter in request_parameters.values(): 3465 if isinstance(request_parameter, QueryPropertiesModel): 3466 return True 3467 return False 3468 3469 @staticmethod 3470 def _remove_query_properties( 3471 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3472 ) -> Mapping[str, str]: 3473 return { 3474 parameter_field: request_parameter 3475 for parameter_field, request_parameter in request_parameters.items() 3476 if not isinstance(request_parameter, QueryPropertiesModel) 3477 } 3478 3479 def create_state_delegating_stream( 3480 self, 3481 model: StateDelegatingStreamModel, 3482 config: Config, 3483 has_parent_state: Optional[bool] = None, 3484 **kwargs: Any, 3485 ) -> DefaultStream: 3486 if ( 3487 model.full_refresh_stream.name != model.name 3488 or model.name != model.incremental_stream.name 3489 ): 3490 raise ValueError( 3491 f"state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3492 ) 3493 3494 stream_model = self._get_state_delegating_stream_model( 3495 False if has_parent_state is None else has_parent_state, model 3496 ) 3497 3498 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always a DeclarativeStreamModel 3499 3500 def _get_state_delegating_stream_model( 3501 self, has_parent_state: bool, model: StateDelegatingStreamModel 3502 ) -> DeclarativeStreamModel: 3503 return ( 3504 model.incremental_stream 3505 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3506 else model.full_refresh_stream 3507 ) 3508 3509 def _create_async_job_status_mapping( 3510 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3511 ) -> Mapping[str, AsyncJobStatus]: 3512 api_status_to_cdk_status = {} 3513 for cdk_status, api_statuses in model.dict().items(): 3514 if cdk_status == "type": 3515 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3516 continue 3517 3518 for status in api_statuses: 3519 if status in api_status_to_cdk_status: 3520 raise ValueError( 3521 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3522 ) 3523 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3524 return api_status_to_cdk_status 3525 3526 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3527 match status: 3528 case "running": 3529 return AsyncJobStatus.RUNNING 3530 case "completed": 3531 return AsyncJobStatus.COMPLETED 3532 case "failed": 3533 return AsyncJobStatus.FAILED 3534 case "timeout": 3535 return AsyncJobStatus.TIMED_OUT 3536 case _: 3537 raise ValueError(f"Unsupported CDK status {status}") 3538 3539 def create_async_retriever( 3540 self, 3541 model: AsyncRetrieverModel, 3542 config: Config, 3543 *, 3544 name: str, 3545 primary_key: Optional[ 3546 Union[str, List[str], List[List[str]]] 3547 ], # this seems to be needed to match create_simple_retriever 3548 stream_slicer: Optional[StreamSlicer], 3549 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3550 transformations: List[RecordTransformation], 3551 **kwargs: Any, 3552 ) -> AsyncRetriever: 3553 if model.download_target_requester and not model.download_target_extractor: 3554 raise ValueError( 3555 f"`download_target_extractor` required if using a `download_target_requester`" 3556 ) 3557 3558 def _get_download_retriever( 3559 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3560 ) -> SimpleRetriever: 3561 # We create a record selector for the download retriever 3562 # with no schema normalization and no transformations, neither record filter 3563 # as all this occurs in the record_selector of the AsyncRetriever 3564 record_selector = RecordSelector( 3565 extractor=extractor, 3566 name=name, 3567 record_filter=None, 3568 transformations=[], 3569 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3570 config=config, 3571 parameters={}, 3572 ) 3573 paginator = ( 3574 self._create_component_from_model( 3575 model=model.download_paginator, 3576 decoder=_decoder, 3577 config=config, 3578 url_base="", 3579 ) 3580 if model.download_paginator 3581 else NoPagination(parameters={}) 3582 ) 3583 3584 return SimpleRetriever( 3585 requester=requester, 3586 record_selector=record_selector, 3587 primary_key=None, 3588 name=name, 3589 paginator=paginator, 3590 config=config, 3591 parameters={}, 3592 log_formatter=self._get_log_formatter(None, name), 3593 ) 3594 3595 def _get_job_timeout() -> datetime.timedelta: 3596 user_defined_timeout: Optional[int] = ( 3597 int( 3598 InterpolatedString.create( 3599 str(model.polling_job_timeout), 3600 parameters={}, 3601 ).eval(config) 3602 ) 3603 if model.polling_job_timeout 3604 else None 3605 ) 3606 3607 # check for user defined timeout during the test read or 15 minutes 3608 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3609 # default value for non-connector builder is 60 minutes. 
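# Net effect of the two branches around this point (illustrative, the value 30 is only an example): with
# polling_job_timeout set to e.g. 30, both Connector Builder test reads and regular syncs use a 30 minute
# timeout; when it is unset, test reads fall back to 15 minutes and regular syncs to 60 minutes.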
3610 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3611 3612 return ( 3613 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3614 ) 3615 3616 decoder = ( 3617 self._create_component_from_model(model=model.decoder, config=config) 3618 if model.decoder 3619 else JsonDecoder(parameters={}) 3620 ) 3621 record_selector = self._create_component_from_model( 3622 model=model.record_selector, 3623 config=config, 3624 decoder=decoder, 3625 name=name, 3626 transformations=transformations, 3627 client_side_incremental_sync=client_side_incremental_sync, 3628 ) 3629 3630 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3631 if self._should_limit_slices_fetched(): 3632 stream_slicer = cast( 3633 StreamSlicer, 3634 StreamSlicerTestReadDecorator( 3635 wrapped_slicer=stream_slicer, 3636 maximum_number_of_slices=self._limit_slices_fetched or 5, 3637 ), 3638 ) 3639 3640 creation_requester = self._create_component_from_model( 3641 model=model.creation_requester, 3642 decoder=decoder, 3643 config=config, 3644 name=f"job creation - {name}", 3645 ) 3646 polling_requester = self._create_component_from_model( 3647 model=model.polling_requester, 3648 decoder=decoder, 3649 config=config, 3650 name=f"job polling - {name}", 3651 ) 3652 job_download_components_name = f"job download - {name}" 3653 download_decoder = ( 3654 self._create_component_from_model(model=model.download_decoder, config=config) 3655 if model.download_decoder 3656 else JsonDecoder(parameters={}) 3657 ) 3658 download_extractor = ( 3659 self._create_component_from_model( 3660 model=model.download_extractor, 3661 config=config, 3662 decoder=download_decoder, 3663 parameters=model.parameters, 3664 ) 3665 if model.download_extractor 3666 else DpathExtractor( 3667 [], 3668 config=config, 3669 decoder=download_decoder, 3670 parameters=model.parameters or {}, 3671 ) 3672 ) 3673 download_requester = self._create_component_from_model( 3674 model=model.download_requester, 3675 decoder=download_decoder, 3676 config=config, 3677 name=job_download_components_name, 3678 ) 3679 download_retriever = _get_download_retriever( 3680 download_requester, download_extractor, download_decoder 3681 ) 3682 abort_requester = ( 3683 self._create_component_from_model( 3684 model=model.abort_requester, 3685 decoder=decoder, 3686 config=config, 3687 name=f"job abort - {name}", 3688 ) 3689 if model.abort_requester 3690 else None 3691 ) 3692 delete_requester = ( 3693 self._create_component_from_model( 3694 model=model.delete_requester, 3695 decoder=decoder, 3696 config=config, 3697 name=f"job delete - {name}", 3698 ) 3699 if model.delete_requester 3700 else None 3701 ) 3702 download_target_requester = ( 3703 self._create_component_from_model( 3704 model=model.download_target_requester, 3705 decoder=decoder, 3706 config=config, 3707 name=f"job extract_url - {name}", 3708 ) 3709 if model.download_target_requester 3710 else None 3711 ) 3712 status_extractor = self._create_component_from_model( 3713 model=model.status_extractor, decoder=decoder, config=config, name=name 3714 ) 3715 download_target_extractor = ( 3716 self._create_component_from_model( 3717 model=model.download_target_extractor, 3718 decoder=decoder, 3719 config=config, 3720 name=name, 3721 ) 3722 if model.download_target_extractor 3723 else None 3724 ) 3725 3726 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3727 creation_requester=creation_requester, 3728 polling_requester=polling_requester, 3729 
download_retriever=download_retriever, 3730 download_target_requester=download_target_requester, 3731 abort_requester=abort_requester, 3732 delete_requester=delete_requester, 3733 status_extractor=status_extractor, 3734 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3735 download_target_extractor=download_target_extractor, 3736 job_timeout=_get_job_timeout(), 3737 ) 3738 3739 async_job_partition_router = AsyncJobPartitionRouter( 3740 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3741 job_repository, 3742 stream_slices, 3743 self._job_tracker, 3744 self._message_repository, 3745 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3746 has_bulk_parent=False, 3747 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3748 # `None` == default retry is set to 3 attempts, under the hood. 3749 job_max_retry=1 if self._emit_connector_builder_messages else None, 3750 ), 3751 stream_slicer=stream_slicer, 3752 config=config, 3753 parameters=model.parameters or {}, 3754 ) 3755 3756 return AsyncRetriever( 3757 record_selector=record_selector, 3758 stream_slicer=async_job_partition_router, 3759 config=config, 3760 parameters=model.parameters or {}, 3761 ) 3762 3763 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3764 config_migrations = [ 3765 self._create_component_from_model(migration, config) 3766 for migration in ( 3767 model.config_normalization_rules.config_migrations 3768 if ( 3769 model.config_normalization_rules 3770 and model.config_normalization_rules.config_migrations 3771 ) 3772 else [] 3773 ) 3774 ] 3775 config_transformations = [ 3776 self._create_component_from_model(transformation, config) 3777 for transformation in ( 3778 model.config_normalization_rules.transformations 3779 if ( 3780 model.config_normalization_rules 3781 and model.config_normalization_rules.transformations 3782 ) 3783 else [] 3784 ) 3785 ] 3786 config_validations = [ 3787 self._create_component_from_model(validation, config) 3788 for validation in ( 3789 model.config_normalization_rules.validations 3790 if ( 3791 model.config_normalization_rules 3792 and model.config_normalization_rules.validations 3793 ) 3794 else [] 3795 ) 3796 ] 3797 3798 return Spec( 3799 connection_specification=model.connection_specification, 3800 documentation_url=model.documentation_url, 3801 advanced_auth=model.advanced_auth, 3802 parameters={}, 3803 config_migrations=config_migrations, 3804 config_transformations=config_transformations, 3805 config_validations=config_validations, 3806 ) 3807 3808 def create_substream_partition_router( 3809 self, 3810 model: SubstreamPartitionRouterModel, 3811 config: Config, 3812 *, 3813 stream_name: str, 3814 **kwargs: Any, 3815 ) -> SubstreamPartitionRouter: 3816 parent_stream_configs = [] 3817 if model.parent_stream_configs: 3818 parent_stream_configs.extend( 3819 [ 3820 self.create_parent_stream_config_with_substream_wrapper( 3821 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3822 ) 3823 for parent_stream_config in model.parent_stream_configs 3824 ] 3825 ) 3826 3827 return SubstreamPartitionRouter( 3828 parent_stream_configs=parent_stream_configs, 3829 parameters=model.parameters or {}, 3830 config=config, 3831 ) 3832 3833 def create_parent_stream_config_with_substream_wrapper( 3834 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3835 ) -> Any: 3836 # getting the parent state 
3837 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3838 3839 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3840 has_parent_state = bool( 3841 self._connector_state_manager.get_stream_state(stream_name, None) 3842 if model.incremental_dependency 3843 else False 3844 ) 3845 connector_state_manager = self._instantiate_parent_stream_state_manager( 3846 child_state, config, model, has_parent_state 3847 ) 3848 3849 substream_factory = ModelToComponentFactory( 3850 connector_state_manager=connector_state_manager, 3851 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3852 limit_slices_fetched=self._limit_slices_fetched, 3853 emit_connector_builder_messages=self._emit_connector_builder_messages, 3854 disable_retries=self._disable_retries, 3855 disable_cache=self._disable_cache, 3856 message_repository=StateFilteringMessageRepository( 3857 LogAppenderMessageRepositoryDecorator( 3858 { 3859 "airbyte_cdk": {"stream": {"is_substream": True}}, 3860 "http": {"is_auxiliary": True}, 3861 }, 3862 self._message_repository, 3863 self._evaluate_log_level(self._emit_connector_builder_messages), 3864 ), 3865 ), 3866 api_budget=self._api_budget, 3867 ) 3868 3869 return substream_factory.create_parent_stream_config( 3870 model=model, config=config, stream_name=stream_name, **kwargs 3871 ) 3872 3873 def _instantiate_parent_stream_state_manager( 3874 self, 3875 child_state: MutableMapping[str, Any], 3876 config: Config, 3877 model: ParentStreamConfigModel, 3878 has_parent_state: bool, 3879 ) -> ConnectorStateManager: 3880 """ 3881 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3882 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3883 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3884 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3885 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3886 incremental_dependency is set. 3887 """ 3888 if model.incremental_dependency and child_state: 3889 parent_stream_name = model.stream.name or "" 3890 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3891 child_state, parent_stream_name 3892 ) 3893 3894 if not parent_state: 3895 # there are two migration cases: state value from child stream or from global state 3896 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3897 child_state, parent_stream_name 3898 ) 3899 3900 if not parent_state and not isinstance(parent_state, dict): 3901 cursor_values = child_state.values() 3902 if cursor_values and len(cursor_values) == 1: 3903 # We assume the child state is a pair `{<cursor_field>: <cursor_value>}` and we will use the 3904 # cursor value as a parent state. 
3905 incremental_sync_model: Union[ 3906 DatetimeBasedCursorModel, 3907 IncrementingCountCursorModel, 3908 ] = ( 3909 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3910 if isinstance(model.stream, DeclarativeStreamModel) 3911 else self._get_state_delegating_stream_model( 3912 has_parent_state, model.stream 3913 ).incremental_sync 3914 ) 3915 cursor_field = InterpolatedString.create( 3916 incremental_sync_model.cursor_field, 3917 parameters=incremental_sync_model.parameters or {}, 3918 ).eval(config) 3919 parent_state = AirbyteStateMessage( 3920 type=AirbyteStateType.STREAM, 3921 stream=AirbyteStreamState( 3922 stream_descriptor=StreamDescriptor( 3923 name=parent_stream_name, namespace=None 3924 ), 3925 stream_state=AirbyteStateBlob( 3926 {cursor_field: list(cursor_values)[0]} 3927 ), 3928 ), 3929 ) 3930 return ConnectorStateManager([parent_state] if parent_state else []) 3931 3932 return ConnectorStateManager([]) 3933 3934 @staticmethod 3935 def create_wait_time_from_header( 3936 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3937 ) -> WaitTimeFromHeaderBackoffStrategy: 3938 return WaitTimeFromHeaderBackoffStrategy( 3939 header=model.header, 3940 parameters=model.parameters or {}, 3941 config=config, 3942 regex=model.regex, 3943 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3944 if model.max_waiting_time_in_seconds is not None 3945 else None, 3946 ) 3947 3948 @staticmethod 3949 def create_wait_until_time_from_header( 3950 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3951 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3952 return WaitUntilTimeFromHeaderBackoffStrategy( 3953 header=model.header, 3954 parameters=model.parameters or {}, 3955 config=config, 3956 min_wait=model.min_wait, 3957 regex=model.regex, 3958 ) 3959 3960 def get_message_repository(self) -> MessageRepository: 3961 return self._message_repository 3962 3963 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3964 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3965 3966 @staticmethod 3967 def create_components_mapping_definition( 3968 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3969 ) -> ComponentMappingDefinition: 3970 interpolated_value = InterpolatedString.create( 3971 model.value, parameters=model.parameters or {} 3972 ) 3973 field_path = [ 3974 InterpolatedString.create(path, parameters=model.parameters or {}) 3975 for path in model.field_path 3976 ] 3977 return ComponentMappingDefinition( 3978 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3979 value=interpolated_value, 3980 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3981 create_or_update=model.create_or_update, 3982 condition=model.condition, 3983 parameters=model.parameters or {}, 3984 ) 3985 3986 def create_http_components_resolver( 3987 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3988 ) -> Any: 3989 retriever = self._create_component_from_model( 3990 model=model.retriever, 3991 config=config, 3992 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3993 primary_key=None, 3994 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3995 transformations=[], 3996 ) 3997 3998 components_mapping = [] 3999 for 
component_mapping_definition_model in model.components_mapping: 4000 if component_mapping_definition_model.condition: 4001 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4002 components_mapping.append( 4003 self._create_component_from_model( 4004 model=component_mapping_definition_model, 4005 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4006 component_mapping_definition_model.value_type 4007 ), 4008 config=config, 4009 ) 4010 ) 4011 4012 return HttpComponentsResolver( 4013 retriever=retriever, 4014 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4015 config=config, 4016 components_mapping=components_mapping, 4017 parameters=model.parameters or {}, 4018 ) 4019 4020 @staticmethod 4021 def create_stream_config( 4022 model: StreamConfigModel, config: Config, **kwargs: Any 4023 ) -> StreamConfig: 4024 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4025 [x for x in model.configs_pointer] if model.configs_pointer else [] 4026 ) 4027 4028 return StreamConfig( 4029 configs_pointer=model_configs_pointer, 4030 default_values=model.default_values, 4031 parameters=model.parameters or {}, 4032 ) 4033 4034 def create_config_components_resolver( 4035 self, 4036 model: ConfigComponentsResolverModel, 4037 config: Config, 4038 ) -> Any: 4039 model_stream_configs = ( 4040 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4041 ) 4042 4043 stream_configs = [ 4044 self._create_component_from_model( 4045 stream_config, config=config, parameters=model.parameters or {} 4046 ) 4047 for stream_config in model_stream_configs 4048 ] 4049 4050 components_mapping = [ 4051 self._create_component_from_model( 4052 model=components_mapping_definition_model, 4053 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4054 components_mapping_definition_model.value_type 4055 ), 4056 config=config, 4057 parameters=model.parameters, 4058 ) 4059 for components_mapping_definition_model in model.components_mapping 4060 ] 4061 4062 return ConfigComponentsResolver( 4063 stream_configs=stream_configs, 4064 config=config, 4065 components_mapping=components_mapping, 4066 parameters=model.parameters or {}, 4067 ) 4068 4069 def create_parametrized_components_resolver( 4070 self, 4071 model: ParametrizedComponentsResolverModel, 4072 config: Config, 4073 ) -> ParametrizedComponentsResolver: 4074 stream_parameters = StreamParametersDefinition( 4075 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4076 ) 4077 4078 components_mapping = [] 4079 for components_mapping_definition_model in model.components_mapping: 4080 if components_mapping_definition_model.condition: 4081 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4082 components_mapping.append( 4083 self._create_component_from_model( 4084 model=components_mapping_definition_model, 4085 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4086 components_mapping_definition_model.value_type 4087 ), 4088 config=config, 4089 ) 4090 ) 4091 return ParametrizedComponentsResolver( 4092 stream_parameters=stream_parameters, 4093 config=config, 4094 components_mapping=components_mapping, 4095 parameters=model.parameters or {}, 4096 ) 4097 4098 _UNSUPPORTED_DECODER_ERROR = ( 4099 "Specified decoder of {decoder_type} is not supported for pagination." 
4100 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4101 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4102 ) 4103 4104 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4105 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4106 return True 4107 elif isinstance(decoder, CompositeRawDecoder): 4108 return self._is_supported_parser_for_pagination(decoder.parser) 4109 else: 4110 return False 4111 4112 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4113 if isinstance(parser, JsonParser): 4114 return True 4115 elif isinstance(parser, GzipParser): 4116 return isinstance(parser.inner_parser, JsonParser) 4117 else: 4118 return False 4119 4120 def create_http_api_budget( 4121 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4122 ) -> HttpAPIBudget: 4123 policies = [ 4124 self._create_component_from_model(model=policy, config=config) 4125 for policy in model.policies 4126 ] 4127 4128 return HttpAPIBudget( 4129 policies=policies, 4130 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4131 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4132 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4133 ) 4134 4135 def create_fixed_window_call_rate_policy( 4136 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4137 ) -> FixedWindowCallRatePolicy: 4138 matchers = [ 4139 self._create_component_from_model(model=matcher, config=config) 4140 for matcher in model.matchers 4141 ] 4142 4143 # Set the initial reset timestamp to 10 days from now. 4144 # This value will be updated by the first request. 
4145 return FixedWindowCallRatePolicy( 4146 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4147 period=parse_duration(model.period), 4148 call_limit=model.call_limit, 4149 matchers=matchers, 4150 ) 4151 4152 def create_file_uploader( 4153 self, model: FileUploaderModel, config: Config, **kwargs: Any 4154 ) -> FileUploader: 4155 name = "File Uploader" 4156 requester = self._create_component_from_model( 4157 model=model.requester, 4158 config=config, 4159 name=name, 4160 **kwargs, 4161 ) 4162 download_target_extractor = self._create_component_from_model( 4163 model=model.download_target_extractor, 4164 config=config, 4165 name=name, 4166 **kwargs, 4167 ) 4168 emit_connector_builder_messages = self._emit_connector_builder_messages 4169 file_uploader = DefaultFileUploader( 4170 requester=requester, 4171 download_target_extractor=download_target_extractor, 4172 config=config, 4173 file_writer=NoopFileWriter() 4174 if emit_connector_builder_messages 4175 else LocalFileSystemFileWriter(), 4176 parameters=model.parameters or {}, 4177 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4178 ) 4179 4180 return ( 4181 ConnectorBuilderFileUploader(file_uploader) 4182 if emit_connector_builder_messages 4183 else file_uploader 4184 ) 4185 4186 def create_moving_window_call_rate_policy( 4187 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4188 ) -> MovingWindowCallRatePolicy: 4189 rates = [ 4190 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4191 ] 4192 matchers = [ 4193 self._create_component_from_model(model=matcher, config=config) 4194 for matcher in model.matchers 4195 ] 4196 return MovingWindowCallRatePolicy( 4197 rates=rates, 4198 matchers=matchers, 4199 ) 4200 4201 def create_unlimited_call_rate_policy( 4202 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4203 ) -> UnlimitedCallRatePolicy: 4204 matchers = [ 4205 self._create_component_from_model(model=matcher, config=config) 4206 for matcher in model.matchers 4207 ] 4208 4209 return UnlimitedCallRatePolicy( 4210 matchers=matchers, 4211 ) 4212 4213 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4214 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4215 return Rate( 4216 limit=int(interpolated_limit.eval(config=config)), 4217 interval=parse_duration(model.interval), 4218 ) 4219 4220 def create_http_request_matcher( 4221 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4222 ) -> HttpRequestRegexMatcher: 4223 return HttpRequestRegexMatcher( 4224 method=model.method, 4225 url_base=model.url_base, 4226 url_path_pattern=model.url_path_pattern, 4227 params=model.params, 4228 headers=model.headers, 4229 ) 4230 4231 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4232 self._api_budget = self.create_component( 4233 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4234 ) 4235 4236 def create_grouping_partition_router( 4237 self, 4238 model: GroupingPartitionRouterModel, 4239 config: Config, 4240 *, 4241 stream_name: str, 4242 **kwargs: Any, 4243 ) -> GroupingPartitionRouter: 4244 underlying_router = self._create_component_from_model( 4245 model=model.underlying_partition_router, 4246 config=config, 4247 stream_name=stream_name, 4248 **kwargs, 4249 ) 4250 if model.group_size < 1: 4251 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4252 4253 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4254 # because they are specific to individual partitions and cannot be aggregated or handled 4255 # when grouping, potentially leading to incorrect API calls. Any request customization 4256 # should be managed at the stream level through the requester's configuration. 4257 if isinstance(underlying_router, SubstreamPartitionRouter): 4258 if any( 4259 parent_config.request_option 4260 for parent_config in underlying_router.parent_stream_configs 4261 ): 4262 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4263 4264 if isinstance(underlying_router, ListPartitionRouter): 4265 if underlying_router.request_option: 4266 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4267 4268 return GroupingPartitionRouter( 4269 group_size=model.group_size, 4270 underlying_partition_router=underlying_router, 4271 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4272 config=config, 4273 ) 4274 4275 def _ensure_query_properties_to_model( 4276 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4277 ) -> None: 4278 """ 4279 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4280 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4281 proper model. 4282 """ 4283 if not hasattr(requester, "request_parameters"): 4284 return 4285 4286 request_parameters = requester.request_parameters 4287 if request_parameters and isinstance(request_parameters, Dict): 4288 for request_parameter_key in request_parameters.keys(): 4289 request_parameter = request_parameters[request_parameter_key] 4290 if ( 4291 isinstance(request_parameter, Dict) 4292 and request_parameter.get("type") == "QueryProperties" 4293 ): 4294 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4295 request_parameter 4296 )
674 def __init__( 675 self, 676 limit_pages_fetched_per_slice: Optional[int] = None, 677 limit_slices_fetched: Optional[int] = None, 678 emit_connector_builder_messages: bool = False, 679 disable_retries: bool = False, 680 disable_cache: bool = False, 681 message_repository: Optional[MessageRepository] = None, 682 connector_state_manager: Optional[ConnectorStateManager] = None, 683 max_concurrent_async_job_count: Optional[int] = None, 684 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 685 api_budget: Optional[APIBudget] = None, 686 ): 687 self._init_mappings() 688 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 689 self._limit_slices_fetched = limit_slices_fetched 690 self._emit_connector_builder_messages = emit_connector_builder_messages 691 self._disable_retries = disable_retries 692 self._disable_cache = disable_cache 693 self._message_repository = message_repository or InMemoryMessageRepository( 694 self._evaluate_log_level(emit_connector_builder_messages) 695 ) 696 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 697 configured_catalog 698 ) 699 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 700 self._api_budget: Optional[APIBudget] = api_budget 701 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 702 # placeholder for deprecation warnings 703 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
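For reference, a minimal sketch of constructing the factory with the test-read style limits used by the Connector Builder. The argument names come from the __init__ shown above; the specific values are illustrative only.

# Minimal sketch: a factory configured for a Connector Builder style test read.
# Argument names come from __init__ above; the values are illustrative.
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory(
    limit_pages_fetched_per_slice=5,
    limit_slices_fetched=5,
    emit_connector_builder_messages=True,
    disable_retries=True,
    disable_cache=True,
)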
826 def create_component( 827 self, 828 model_type: Type[BaseModel], 829 component_definition: ComponentDefinition, 830 config: Config, 831 **kwargs: Any, 832 ) -> Any: 833 """ 834 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 835 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 836 creating declarative components from that model. 837 838 :param model_type: The type of declarative component that is being initialized 839 :param component_definition: The mapping that represents a declarative component 840 :param config: The connector config that is provided by the customer 841 :return: The declarative component to be used at runtime 842 """ 843 844 component_type = component_definition.get("type") 845 if component_definition.get("type") != model_type.__name__: 846 raise ValueError( 847 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 848 ) 849 850 declarative_component_model = model_type.parse_obj(component_definition) 851 852 if not isinstance(declarative_component_model, model_type): 853 raise ValueError( 854 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 855 ) 856 857 return self._create_component_from_model( 858 model=declarative_component_model, config=config, **kwargs 859 )
Takes a given Pydantic model type and a Mapping representing a component definition, and creates the declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
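As a rough usage sketch, reusing the factory instance from the earlier example: create_component can be driven directly with a manifest-style mapping whose "type" matches the model's class name. The import path and the CheckStream model name are assumptions based on the aliases used in this module; the stream name is hypothetical.

# Hedged example: build a CheckStream component from a manifest-style mapping.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,
)

component_definition = {
    "type": "CheckStream",          # must equal model_type.__name__
    "stream_names": ["customers"],  # hypothetical stream
}
check_stream = factory.create_component(
    model_type=CheckStreamModel,
    component_definition=component_definition,
    config={},  # connector config provided by the user
)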
876 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 877 """ 878 Returns the deprecation warnings that were collected during the creation of components. 879 """ 880 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
897 def create_config_migration( 898 self, model: ConfigMigrationModel, config: Config 899 ) -> ConfigMigration: 900 transformations: List[ConfigTransformation] = [ 901 self._create_component_from_model(transformation, config) 902 for transformation in model.transformations 903 ] 904 905 return ConfigMigration( 906 description=model.description, 907 transformations=transformations, 908 )
910 def create_config_add_fields( 911 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 912 ) -> ConfigAddFields: 913 fields = [self._create_component_from_model(field, config) for field in model.fields] 914 return ConfigAddFields( 915 fields=fields, 916 condition=model.condition or "", 917 )
966 @staticmethod 967 def create_added_field_definition( 968 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 969 ) -> AddedFieldDefinition: 970 interpolated_value = InterpolatedString.create( 971 model.value, parameters=model.parameters or {} 972 ) 973 return AddedFieldDefinition( 974 path=model.path, 975 value=interpolated_value, 976 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 977 parameters=model.parameters or {}, 978 )
980 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 981 added_field_definitions = [ 982 self._create_component_from_model( 983 model=added_field_definition_model, 984 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 985 added_field_definition_model.value_type 986 ), 987 config=config, 988 ) 989 for added_field_definition_model in model.fields 990 ] 991 return AddFields( 992 fields=added_field_definitions, 993 condition=model.condition or "", 994 parameters=model.parameters or {}, 995 )
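For illustration, a definition that create_add_fields could receive, adding one constant field and one field interpolated from the record. Field paths and values are made up; the shape follows the AddFields and AddedFieldDefinition models referenced above.

# Illustrative AddFields definition: one static field, one interpolated field.
add_fields_definition = {
    "type": "AddFields",
    "fields": [
        {"path": ["source_system"], "value": "example_api"},
        {"path": ["copied_id"], "value": "{{ record['id'] }}"},
    ],
}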
1021 def create_dpath_flatten_fields( 1022 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1023 ) -> DpathFlattenFields: 1024 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1025 key_transformation = ( 1026 KeyTransformation( 1027 config=config, 1028 prefix=model.key_transformation.prefix, 1029 suffix=model.key_transformation.suffix, 1030 parameters=model.parameters or {}, 1031 ) 1032 if model.key_transformation is not None 1033 else None 1034 ) 1035 return DpathFlattenFields( 1036 config=config, 1037 field_path=model_field_path, 1038 delete_origin_value=model.delete_origin_value 1039 if model.delete_origin_value is not None 1040 else False, 1041 replace_record=model.replace_record if model.replace_record is not None else False, 1042 key_transformation=key_transformation, 1043 parameters=model.parameters or {}, 1044 )
1058 def create_api_key_authenticator( 1059 self, 1060 model: ApiKeyAuthenticatorModel, 1061 config: Config, 1062 token_provider: Optional[TokenProvider] = None, 1063 **kwargs: Any, 1064 ) -> ApiKeyAuthenticator: 1065 if model.inject_into is None and model.header is None: 1066 raise ValueError( 1067 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1068 ) 1069 1070 if model.inject_into is not None and model.header is not None: 1071 raise ValueError( 1072 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1073 ) 1074 1075 if token_provider is not None and model.api_token != "": 1076 raise ValueError( 1077 "If token_provider is set, api_token is ignored and has to be set to empty string." 1078 ) 1079 1080 request_option = ( 1081 self._create_component_from_model( 1082 model.inject_into, config, parameters=model.parameters or {} 1083 ) 1084 if model.inject_into 1085 else RequestOption( 1086 inject_into=RequestOptionType.header, 1087 field_name=model.header or "", 1088 parameters=model.parameters or {}, 1089 ) 1090 ) 1091 1092 return ApiKeyAuthenticator( 1093 token_provider=( 1094 token_provider 1095 if token_provider is not None 1096 else InterpolatedStringTokenProvider( 1097 api_token=model.api_token or "", 1098 config=config, 1099 parameters=model.parameters or {}, 1100 ) 1101 ), 1102 request_option=request_option, 1103 config=config, 1104 parameters=model.parameters or {}, 1105 )
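A sketch of a definition that satisfies the validation above, using the non-deprecated inject_into option. The header name and token reference are illustrative; the field names follow the ApiKeyAuthenticator and RequestOption models used in the code.

# Hypothetical ApiKeyAuthenticator definition. Exactly one of `inject_into`
# or the deprecated `header` option may be set, as enforced above.
api_key_definition = {
    "type": "ApiKeyAuthenticator",
    "api_token": "{{ config['api_key'] }}",
    "inject_into": {
        "type": "RequestOption",
        "inject_into": "header",
        "field_name": "X-Api-Key",
    },
}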
1107 def create_legacy_to_per_partition_state_migration( 1108 self, 1109 model: LegacyToPerPartitionStateMigrationModel, 1110 config: Mapping[str, Any], 1111 declarative_stream: DeclarativeStreamModel, 1112 ) -> LegacyToPerPartitionStateMigration: 1113 retriever = declarative_stream.retriever 1114 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1115 raise ValueError( 1116 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1117 ) 1118 partition_router = retriever.partition_router 1119 if not isinstance( 1120 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1121 ): 1122 raise ValueError( 1123 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1124 ) 1125 if not hasattr(partition_router, "parent_stream_configs"): 1126 raise ValueError( 1127 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1128 ) 1129 1130 if not hasattr(declarative_stream, "incremental_sync"): 1131 raise ValueError( 1132 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1133 ) 1134 1135 return LegacyToPerPartitionStateMigration( 1136 partition_router, # type: ignore # was already checked above 1137 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1138 config, 1139 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1140 )
1142 def create_session_token_authenticator( 1143 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1144 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1145 decoder = ( 1146 self._create_component_from_model(model=model.decoder, config=config) 1147 if model.decoder 1148 else JsonDecoder(parameters={}) 1149 ) 1150 login_requester = self._create_component_from_model( 1151 model=model.login_requester, 1152 config=config, 1153 name=f"{name}_login_requester", 1154 decoder=decoder, 1155 ) 1156 token_provider = SessionTokenProvider( 1157 login_requester=login_requester, 1158 session_token_path=model.session_token_path, 1159 expiration_duration=parse_duration(model.expiration_duration) 1160 if model.expiration_duration 1161 else None, 1162 parameters=model.parameters or {}, 1163 message_repository=self._message_repository, 1164 decoder=decoder, 1165 ) 1166 if model.request_authentication.type == "Bearer": 1167 return ModelToComponentFactory.create_bearer_authenticator( 1168 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1169 config, 1170 token_provider=token_provider, 1171 ) 1172 else: 1173 return self.create_api_key_authenticator( 1174 ApiKeyAuthenticatorModel( 1175 type="ApiKeyAuthenticator", 1176 api_token="", 1177 inject_into=model.request_authentication.inject_into, 1178 ), # type: ignore # $parameters and headers default to None 1179 config=config, 1180 token_provider=token_provider, 1181 )
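A hedged sketch of a SessionTokenAuthenticator definition that would take the Bearer branch above. The URL, path, token path, and expiration are illustrative; the field names mirror the model attributes read by the method (login_requester, session_token_path, expiration_duration, request_authentication).

# Hypothetical SessionTokenAuthenticator definition. The login response is
# expected to expose the session token at `session_token_path`; the token is
# then sent as a Bearer token on subsequent requests.
session_token_definition = {
    "type": "SessionTokenAuthenticator",
    "login_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",
        "path": "/v1/login",
        "http_method": "POST",
    },
    "session_token_path": ["token"],
    "expiration_duration": "PT1H",
    "request_authentication": {"type": "Bearer"},
}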
1183 @staticmethod 1184 def create_basic_http_authenticator( 1185 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1186 ) -> BasicHttpAuthenticator: 1187 return BasicHttpAuthenticator( 1188 password=model.password or "", 1189 username=model.username, 1190 config=config, 1191 parameters=model.parameters or {}, 1192 )
1194 @staticmethod 1195 def create_bearer_authenticator( 1196 model: BearerAuthenticatorModel, 1197 config: Config, 1198 token_provider: Optional[TokenProvider] = None, 1199 **kwargs: Any, 1200 ) -> BearerAuthenticator: 1201 if token_provider is not None and model.api_token != "": 1202 raise ValueError( 1203 "If token_provider is set, api_token is ignored and has to be set to empty string." 1204 ) 1205 return BearerAuthenticator( 1206 token_provider=( 1207 token_provider 1208 if token_provider is not None 1209 else InterpolatedStringTokenProvider( 1210 api_token=model.api_token or "", 1211 config=config, 1212 parameters=model.parameters or {}, 1213 ) 1214 ), 1215 config=config, 1216 parameters=model.parameters or {}, 1217 )
1219 @staticmethod 1220 def create_dynamic_stream_check_config( 1221 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1222 ) -> DynamicStreamCheckConfig: 1223 return DynamicStreamCheckConfig( 1224 dynamic_stream_name=model.dynamic_stream_name, 1225 stream_count=model.stream_count or 0, 1226 )
1228 def create_check_stream( 1229 self, model: CheckStreamModel, config: Config, **kwargs: Any 1230 ) -> CheckStream: 1231 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1232 raise ValueError( 1233 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1234 ) 1235 1236 dynamic_streams_check_configs = ( 1237 [ 1238 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1239 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1240 ] 1241 if model.dynamic_streams_check_configs 1242 else [] 1243 ) 1244 1245 return CheckStream( 1246 stream_names=model.stream_names or [], 1247 dynamic_streams_check_configs=dynamic_streams_check_configs, 1248 parameters={}, 1249 )
1251 @staticmethod 1252 def create_check_dynamic_stream( 1253 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1254 ) -> CheckDynamicStream: 1255 assert model.use_check_availability is not None # for mypy 1256 1257 use_check_availability = model.use_check_availability 1258 1259 return CheckDynamicStream( 1260 stream_count=model.stream_count, 1261 use_check_availability=use_check_availability, 1262 parameters={}, 1263 )
1265 def create_composite_error_handler( 1266 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1267 ) -> CompositeErrorHandler: 1268 error_handlers = [ 1269 self._create_component_from_model(model=error_handler_model, config=config) 1270 for error_handler_model in model.error_handlers 1271 ] 1272 return CompositeErrorHandler( 1273 error_handlers=error_handlers, parameters=model.parameters or {} 1274 )
1276 @staticmethod 1277 def create_concurrency_level( 1278 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1279 ) -> ConcurrencyLevel: 1280 return ConcurrencyLevel( 1281 default_concurrency=model.default_concurrency, 1282 max_concurrency=model.max_concurrency, 1283 config=config, 1284 parameters={}, 1285 )
1287 @staticmethod 1288 def apply_stream_state_migrations( 1289 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1290 ) -> MutableMapping[str, Any]: 1291 if stream_state_migrations: 1292 for state_migration in stream_state_migrations: 1293 if state_migration.should_migrate(stream_state): 1294 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1295 stream_state = dict(state_migration.migrate(stream_state)) 1296 return stream_state
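An illustrative-only example of the should_migrate/migrate contract this helper relies on; the migration class below is hypothetical and simply renames a legacy cursor key.

# Illustrative-only migration object. Any object exposing should_migrate(state)
# and migrate(state) with these semantics works with apply_stream_state_migrations.
class RenameLegacyCursorKey:
    def should_migrate(self, stream_state):
        return "updated" in stream_state

    def migrate(self, stream_state):
        return {"updated_at": stream_state["updated"]}

state = {"updated": "2024-01-01T00:00:00Z"}
state = ModelToComponentFactory.apply_stream_state_migrations(
    [RenameLegacyCursorKey()], state
)
# state == {"updated_at": "2024-01-01T00:00:00Z"}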
1298 def create_concurrent_cursor_from_datetime_based_cursor( 1299 self, 1300 model_type: Type[BaseModel], 1301 component_definition: ComponentDefinition, 1302 stream_name: str, 1303 stream_namespace: Optional[str], 1304 stream_state: MutableMapping[str, Any], 1305 config: Config, 1306 message_repository: Optional[MessageRepository] = None, 1307 runtime_lookback_window: Optional[datetime.timedelta] = None, 1308 **kwargs: Any, 1309 ) -> ConcurrentCursor: 1310 component_type = component_definition.get("type") 1311 if component_definition.get("type") != model_type.__name__: 1312 raise ValueError( 1313 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1314 ) 1315 1316 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1317 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1318 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1319 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1320 if "$parameters" not in component_definition and "parameters" in component_definition: 1321 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1322 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1323 1324 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1325 raise ValueError( 1326 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1327 ) 1328 1329 model_parameters = datetime_based_cursor_model.parameters or {} 1330 interpolated_cursor_field = InterpolatedString.create( 1331 datetime_based_cursor_model.cursor_field, 1332 parameters=model_parameters, 1333 ) 1334 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1335 1336 interpolated_partition_field_start = InterpolatedString.create( 1337 datetime_based_cursor_model.partition_field_start or "start_time", 1338 parameters=model_parameters, 1339 ) 1340 interpolated_partition_field_end = InterpolatedString.create( 1341 datetime_based_cursor_model.partition_field_end or "end_time", 1342 parameters=model_parameters, 1343 ) 1344 1345 slice_boundary_fields = ( 1346 interpolated_partition_field_start.eval(config=config), 1347 interpolated_partition_field_end.eval(config=config), 1348 ) 1349 1350 datetime_format = datetime_based_cursor_model.datetime_format 1351 1352 cursor_granularity = ( 1353 parse_duration(datetime_based_cursor_model.cursor_granularity) 1354 if datetime_based_cursor_model.cursor_granularity 1355 else None 1356 ) 1357 1358 lookback_window = None 1359 interpolated_lookback_window = ( 1360 InterpolatedString.create( 1361 datetime_based_cursor_model.lookback_window, 1362 parameters=model_parameters, 1363 ) 1364 if datetime_based_cursor_model.lookback_window 1365 else None 1366 ) 1367 if interpolated_lookback_window: 1368 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1369 if 
evaluated_lookback_window: 1370 lookback_window = parse_duration(evaluated_lookback_window) 1371 1372 connector_state_converter: DateTimeStreamStateConverter 1373 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1374 datetime_format=datetime_format, 1375 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1376 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1377 cursor_granularity=cursor_granularity, 1378 ) 1379 1380 # Adjusts the stream state by applying the runtime lookback window. 1381 # This is used to ensure correct state handling in case of failed partitions. 1382 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1383 if runtime_lookback_window and stream_state_value: 1384 new_stream_state = ( 1385 connector_state_converter.parse_timestamp(stream_state_value) 1386 - runtime_lookback_window 1387 ) 1388 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1389 new_stream_state 1390 ) 1391 1392 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1393 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1394 start_date_runtime_value = self.create_min_max_datetime( 1395 model=datetime_based_cursor_model.start_datetime, config=config 1396 ) 1397 else: 1398 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1399 1400 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1401 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1402 end_date_runtime_value = self.create_min_max_datetime( 1403 model=datetime_based_cursor_model.end_datetime, config=config 1404 ) 1405 else: 1406 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1407 1408 interpolated_start_date = MinMaxDatetime.create( 1409 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1410 parameters=datetime_based_cursor_model.parameters, 1411 ) 1412 interpolated_end_date = ( 1413 None 1414 if not end_date_runtime_value 1415 else MinMaxDatetime.create( 1416 end_date_runtime_value, datetime_based_cursor_model.parameters 1417 ) 1418 ) 1419 1420 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1421 if not interpolated_start_date.datetime_format: 1422 interpolated_start_date.datetime_format = datetime_format 1423 if interpolated_end_date and not interpolated_end_date.datetime_format: 1424 interpolated_end_date.datetime_format = datetime_format 1425 1426 start_date = interpolated_start_date.get_datetime(config=config) 1427 end_date_provider = ( 1428 partial(interpolated_end_date.get_datetime, config) 1429 if interpolated_end_date 1430 else connector_state_converter.get_end_provider() 1431 ) 1432 1433 if ( 1434 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1435 ) or ( 1436 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1437 ): 1438 raise ValueError( 1439 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1440 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1441 ) 1442 1443 # When step is not defined, default to a step size from the starting date to the present moment 1444 step_length = datetime.timedelta.max 1445 interpolated_step = ( 1446 InterpolatedString.create( 1447 datetime_based_cursor_model.step, 1448 parameters=model_parameters, 1449 ) 1450 if datetime_based_cursor_model.step 1451 else None 1452 ) 1453 if interpolated_step: 1454 evaluated_step = interpolated_step.eval(config) 1455 if evaluated_step: 1456 step_length = parse_duration(evaluated_step) 1457 1458 clamping_strategy: ClampingStrategy = NoClamping() 1459 if datetime_based_cursor_model.clamping: 1460 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1461 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1462 # object which we want to keep agnostic of being low-code 1463 target = InterpolatedString( 1464 string=datetime_based_cursor_model.clamping.target, 1465 parameters=model_parameters, 1466 ) 1467 evaluated_target = target.eval(config=config) 1468 match evaluated_target: 1469 case "DAY": 1470 clamping_strategy = DayClampingStrategy() 1471 end_date_provider = ClampingEndProvider( 1472 DayClampingStrategy(is_ceiling=False), 1473 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1474 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1475 ) 1476 case "WEEK": 1477 if ( 1478 not datetime_based_cursor_model.clamping.target_details 1479 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1480 ): 1481 raise ValueError( 1482 "Given WEEK clamping, weekday needs to be provided as target_details" 1483 ) 1484 weekday = self._assemble_weekday( 1485 datetime_based_cursor_model.clamping.target_details["weekday"] 1486 ) 1487 clamping_strategy = WeekClampingStrategy(weekday) 1488 end_date_provider = ClampingEndProvider( 1489 WeekClampingStrategy(weekday, is_ceiling=False), 1490 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1491 granularity=cursor_granularity or datetime.timedelta(days=1), 1492 ) 1493 case "MONTH": 1494 clamping_strategy = MonthClampingStrategy() 1495 end_date_provider = ClampingEndProvider( 1496 MonthClampingStrategy(is_ceiling=False), 1497 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1498 granularity=cursor_granularity or datetime.timedelta(days=1), 1499 ) 1500 case _: 1501 raise ValueError( 1502 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1503 ) 1504 1505 return ConcurrentCursor( 1506 stream_name=stream_name, 1507 stream_namespace=stream_namespace, 1508 stream_state=stream_state, 1509 message_repository=message_repository or self._message_repository, 1510 connector_state_manager=self._connector_state_manager, 1511 connector_state_converter=connector_state_converter, 1512 cursor_field=cursor_field, 1513 slice_boundary_fields=slice_boundary_fields, 1514 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1515 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 lookback_window=lookback_window, 1517 slice_range=step_length, 1518 cursor_granularity=cursor_granularity, 1519 clamping_strategy=clamping_strategy, 1520 )
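A rough sketch of calling this factory method, reusing the factory from the earlier example. DatetimeBasedCursorModel refers to the generated DatetimeBasedCursor model aliased by this module; the stream name, dates, and formats are illustrative. As noted in the FIXME above, the definition may carry either "$parameters" (manifest shape) or "parameters" (model.__dict__ shape); both are normalized at the top of the method.

# Hedged example of building a ConcurrentCursor from a manifest-style
# DatetimeBasedCursor definition. step and cursor_granularity are set together,
# as required by the validation above.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DatetimeBasedCursor as DatetimeBasedCursorModel,
)

cursor_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "{{ config['start_date'] }}",
    "step": "P30D",
    "cursor_granularity": "PT1S",
    "$parameters": {},
}
concurrent_cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
    model_type=DatetimeBasedCursorModel,
    component_definition=cursor_definition,
    stream_name="orders",          # hypothetical
    stream_namespace=None,
    stream_state={},
    config={"start_date": "2024-01-01T00:00:00Z"},
)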
1522 def create_concurrent_cursor_from_incrementing_count_cursor( 1523 self, 1524 model_type: Type[BaseModel], 1525 component_definition: ComponentDefinition, 1526 stream_name: str, 1527 stream_namespace: Optional[str], 1528 stream_state: MutableMapping[str, Any], 1529 config: Config, 1530 message_repository: Optional[MessageRepository] = None, 1531 **kwargs: Any, 1532 ) -> ConcurrentCursor: 1533 component_type = component_definition.get("type") 1534 if component_definition.get("type") != model_type.__name__: 1535 raise ValueError( 1536 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1537 ) 1538 1539 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1540 1541 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1542 raise ValueError( 1543 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1544 ) 1545 1546 interpolated_start_value = ( 1547 InterpolatedString.create( 1548 incrementing_count_cursor_model.start_value, # type: ignore 1549 parameters=incrementing_count_cursor_model.parameters or {}, 1550 ) 1551 if incrementing_count_cursor_model.start_value 1552 else 0 1553 ) 1554 1555 interpolated_cursor_field = InterpolatedString.create( 1556 incrementing_count_cursor_model.cursor_field, 1557 parameters=incrementing_count_cursor_model.parameters or {}, 1558 ) 1559 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1560 1561 connector_state_converter = IncrementingCountStreamStateConverter( 1562 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1563 ) 1564 1565 return ConcurrentCursor( 1566 stream_name=stream_name, 1567 stream_namespace=stream_namespace, 1568 stream_state=stream_state, 1569 message_repository=message_repository or self._message_repository, 1570 connector_state_manager=self._connector_state_manager, 1571 connector_state_converter=connector_state_converter, 1572 cursor_field=cursor_field, 1573 slice_boundary_fields=None, 1574 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1575 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1576 )
1597 def create_concurrent_cursor_from_perpartition_cursor( 1598 self, 1599 state_manager: ConnectorStateManager, 1600 model_type: Type[BaseModel], 1601 component_definition: ComponentDefinition, 1602 stream_name: str, 1603 stream_namespace: Optional[str], 1604 config: Config, 1605 stream_state: MutableMapping[str, Any], 1606 partition_router: PartitionRouter, 1607 attempt_to_create_cursor_if_not_provided: bool = False, 1608 **kwargs: Any, 1609 ) -> ConcurrentPerPartitionCursor: 1610 component_type = component_definition.get("type") 1611 if component_definition.get("type") != model_type.__name__: 1612 raise ValueError( 1613 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1614 ) 1615 1616 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1617 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1618 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1619 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1620 if "$parameters" not in component_definition and "parameters" in component_definition: 1621 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1622 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1623 1624 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1625 raise ValueError( 1626 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1627 ) 1628 1629 interpolated_cursor_field = InterpolatedString.create( 1630 datetime_based_cursor_model.cursor_field, 1631 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1632 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1633 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1634 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1635 parameters=datetime_based_cursor_model.parameters or {}, 1636 ) 1637 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1638 1639 datetime_format = datetime_based_cursor_model.datetime_format 1640 1641 cursor_granularity = ( 1642 parse_duration(datetime_based_cursor_model.cursor_granularity) 1643 if datetime_based_cursor_model.cursor_granularity 1644 else None 1645 ) 1646 1647 connector_state_converter: DateTimeStreamStateConverter 1648 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1649 datetime_format=datetime_format, 1650 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1651 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1652 cursor_granularity=cursor_granularity, 1653 ) 1654 1655 # Create the cursor factory 1656 cursor_factory = ConcurrentCursorFactory( 1657 partial( 1658 self.create_concurrent_cursor_from_datetime_based_cursor, 1659 state_manager=state_manager, 1660 model_type=model_type, 1661 component_definition=component_definition, 1662 stream_name=stream_name, 1663 stream_namespace=stream_namespace, 1664 config=config, 1665 message_repository=NoopMessageRepository(), 1666 ) 1667 ) 1668 1669 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1670 use_global_cursor = isinstance( 1671 partition_router, GroupingPartitionRouter 1672 ) or component_definition.get("global_substream_cursor", False) 1673 1674 # Return the concurrent cursor and state converter 1675 return ConcurrentPerPartitionCursor( 1676 cursor_factory=cursor_factory, 1677 partition_router=partition_router, 1678 stream_name=stream_name, 1679 stream_namespace=stream_namespace, 1680 stream_state=stream_state, 1681 message_repository=self._message_repository, # type: ignore 1682 connector_state_manager=state_manager, 1683 connector_state_converter=connector_state_converter, 1684 cursor_field=cursor_field, 1685 use_global_cursor=use_global_cursor, 1686 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1687 )
1689 @staticmethod 1690 def create_constant_backoff_strategy( 1691 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1692 ) -> ConstantBackoffStrategy: 1693 return ConstantBackoffStrategy( 1694 backoff_time_in_seconds=model.backoff_time_in_seconds, 1695 config=config, 1696 parameters=model.parameters or {}, 1697 )
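For orientation, a minimal sketch of the manifest component this method consumes (parsed into a ConstantBackoffStrategyModel before reaching the factory); the backoff value below is illustrative only:

    # Hypothetical ConstantBackoffStrategy manifest definition (value is an example)
    constant_backoff_definition = {
        "type": "ConstantBackoffStrategy",
        "backoff_time_in_seconds": 10,
    }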
1699 def create_cursor_pagination( 1700 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1701 ) -> CursorPaginationStrategy: 1702 if isinstance(decoder, PaginationDecoderDecorator): 1703 inner_decoder = decoder.decoder 1704 else: 1705 inner_decoder = decoder 1706 decoder = PaginationDecoderDecorator(decoder=decoder) 1707 1708 if self._is_supported_decoder_for_pagination(inner_decoder): 1709 decoder_to_use = decoder 1710 else: 1711 raise ValueError( 1712 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1713 ) 1714 1715 return CursorPaginationStrategy( 1716 cursor_value=model.cursor_value, 1717 decoder=decoder_to_use, 1718 page_size=model.page_size, 1719 stop_condition=model.stop_condition, 1720 config=config, 1721 parameters=model.parameters or {}, 1722 )
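A hedged sketch of a CursorPagination definition that would be routed through this method; the interpolation expressions assume the API returns a next_page field and are illustrative only:

    # Hypothetical CursorPagination manifest definition
    cursor_pagination_definition = {
        "type": "CursorPagination",
        "cursor_value": "{{ response.next_page }}",
        "stop_condition": "{{ not response.next_page }}",
        "page_size": 100,
    }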
1724 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1725 """ 1726 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1727 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1728 :param model: The Pydantic model of the custom component being created 1729 :param config: The custom defined connector config 1730 :return: The declarative component built from the Pydantic model to be used at runtime 1731 """ 1732 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1733 component_fields = get_type_hints(custom_component_class) 1734 model_args = model.dict() 1735 model_args["config"] = config 1736 1737 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1738 # we defer to these arguments over the component's definition 1739 for key, arg in kwargs.items(): 1740 model_args[key] = arg 1741 1742 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1743 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1744 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1745 for model_field, model_value in model_args.items(): 1746 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1747 if ( 1748 isinstance(model_value, dict) 1749 and "type" not in model_value 1750 and model_field in component_fields 1751 ): 1752 derived_type = self._derive_component_type_from_type_hints( 1753 component_fields.get(model_field) 1754 ) 1755 if derived_type: 1756 model_value["type"] = derived_type 1757 1758 if self._is_component(model_value): 1759 model_args[model_field] = self._create_nested_component( 1760 model, 1761 model_field, 1762 model_value, 1763 config, 1764 **kwargs, 1765 ) 1766 elif isinstance(model_value, list): 1767 vals = [] 1768 for v in model_value: 1769 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1770 derived_type = self._derive_component_type_from_type_hints( 1771 component_fields.get(model_field) 1772 ) 1773 if derived_type: 1774 v["type"] = derived_type 1775 if self._is_component(v): 1776 vals.append( 1777 self._create_nested_component( 1778 model, 1779 model_field, 1780 v, 1781 config, 1782 **kwargs, 1783 ) 1784 ) 1785 else: 1786 vals.append(v) 1787 model_args[model_field] = vals 1788 1789 kwargs = { 1790 class_field: model_args[class_field] 1791 for class_field in component_fields.keys() 1792 if class_field in model_args 1793 } 1794 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
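To make this concrete, a hypothetical custom component definition: class_name points at connector-local code, and the extra field is only forwarded if the referenced class declares a matching constructor argument (the module, class, and field names below are made up):

    # Hypothetical custom component definition handled by create_custom_component
    custom_extractor_definition = {
        "type": "CustomRecordExtractor",
        "class_name": "source_example.components.MyRecordExtractor",
        "records_key": "items",  # forwarded only if MyRecordExtractor accepts `records_key`
    }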
1929 def create_default_stream( 1930 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1931 ) -> AbstractStream: 1932 primary_key = model.primary_key.__root__ if model.primary_key else None 1933 self._migrate_state(model, config) 1934 1935 partition_router = self._build_stream_slicer_from_partition_router( 1936 model.retriever, 1937 config, 1938 stream_name=model.name, 1939 **kwargs, 1940 ) 1941 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1942 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1943 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1944 1945 end_time_option = ( 1946 self._create_component_from_model( 1947 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1948 ) 1949 if cursor_model.end_time_option 1950 else None 1951 ) 1952 start_time_option = ( 1953 self._create_component_from_model( 1954 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1955 ) 1956 if cursor_model.start_time_option 1957 else None 1958 ) 1959 1960 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 1961 start_time_option=start_time_option, 1962 end_time_option=end_time_option, 1963 partition_field_start=cursor_model.partition_field_start, 1964 partition_field_end=cursor_model.partition_field_end, 1965 config=config, 1966 parameters=model.parameters or {}, 1967 ) 1968 request_options_provider = ( 1969 datetime_request_options_provider 1970 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 1971 else PerPartitionRequestOptionsProvider( 1972 partition_router, datetime_request_options_provider 1973 ) 1974 ) 1975 elif model.incremental_sync and isinstance( 1976 model.incremental_sync, IncrementingCountCursorModel 1977 ): 1978 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 1979 raise ValueError( 1980 "PerPartition does not support per partition states because switching to global state is time based" 1981 ) 1982 1983 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1984 1985 start_time_option = ( 1986 self._create_component_from_model( 1987 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1988 config, 1989 parameters=cursor_model.parameters or {}, 1990 ) 1991 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1992 else None 1993 ) 1994 1995 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1996 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1997 partition_field_start = "start" 1998 1999 request_options_provider = DatetimeBasedRequestOptionsProvider( 2000 start_time_option=start_time_option, 2001 partition_field_start=partition_field_start, 2002 config=config, 2003 parameters=model.parameters or {}, 2004 ) 2005 else: 2006 request_options_provider = None 2007 2008 transformations = [] 2009 if model.transformations: 2010 for transformation_model in model.transformations: 2011 transformations.append( 2012 self._create_component_from_model(model=transformation_model, config=config) 2013 ) 2014 file_uploader = None 2015 if model.file_uploader: 2016 file_uploader = self._create_component_from_model( 2017 model=model.file_uploader, config=config 2018 ) 2019 2020 stream_slicer: ConcurrentStreamSlicer = ( 2021 partition_router 2022 if 
isinstance(concurrent_cursor, FinalStateCursor) 2023 else concurrent_cursor 2024 ) 2025 2026 retriever = self._create_component_from_model( 2027 model=model.retriever, 2028 config=config, 2029 name=model.name, 2030 primary_key=primary_key, 2031 request_options_provider=request_options_provider, 2032 stream_slicer=stream_slicer, 2033 partition_router=partition_router, 2034 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2035 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2036 cursor=concurrent_cursor, 2037 transformations=transformations, 2038 file_uploader=file_uploader, 2039 incremental_sync=model.incremental_sync, 2040 ) 2041 if isinstance(retriever, AsyncRetriever): 2042 stream_slicer = retriever.stream_slicer 2043 2044 schema_loader: SchemaLoader 2045 if model.schema_loader and isinstance(model.schema_loader, list): 2046 nested_schema_loaders = [ 2047 self._create_component_from_model(model=nested_schema_loader, config=config) 2048 for nested_schema_loader in model.schema_loader 2049 ] 2050 schema_loader = CompositeSchemaLoader( 2051 schema_loaders=nested_schema_loaders, parameters={} 2052 ) 2053 elif model.schema_loader: 2054 schema_loader = self._create_component_from_model( 2055 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2056 config=config, 2057 ) 2058 else: 2059 options = model.parameters or {} 2060 if "name" not in options: 2061 options["name"] = model.name 2062 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2063 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2064 2065 stream_name = model.name or "" 2066 return DefaultStream( 2067 partition_generator=StreamSlicerPartitionGenerator( 2068 DeclarativePartitionFactory( 2069 stream_name, 2070 schema_loader, 2071 retriever, 2072 self._message_repository, 2073 ), 2074 stream_slicer, 2075 slice_limit=self._limit_slices_fetched, 2076 ), 2077 name=stream_name, 2078 json_schema=schema_loader.get_json_schema, 2079 primary_key=get_primary_key_from_stream(primary_key), 2080 cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2081 if hasattr(concurrent_cursor, "cursor_field") 2082 else "", # FIXME we should have the cursor field has part of the interface of cursor, 2083 logger=logging.getLogger(f"airbyte.{stream_name}"), 2084 cursor=concurrent_cursor, 2085 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2086 )
2228 def create_default_error_handler( 2229 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2230 ) -> DefaultErrorHandler: 2231 backoff_strategies = [] 2232 if model.backoff_strategies: 2233 for backoff_strategy_model in model.backoff_strategies: 2234 backoff_strategies.append( 2235 self._create_component_from_model(model=backoff_strategy_model, config=config) 2236 ) 2237 2238 response_filters = [] 2239 if model.response_filters: 2240 for response_filter_model in model.response_filters: 2241 response_filters.append( 2242 self._create_component_from_model(model=response_filter_model, config=config) 2243 ) 2244 response_filters.append( 2245 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2246 ) 2247 2248 return DefaultErrorHandler( 2249 backoff_strategies=backoff_strategies, 2250 max_retries=model.max_retries, 2251 response_filters=response_filters, 2252 config=config, 2253 parameters=model.parameters or {}, 2254 )
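A sketch of a DefaultErrorHandler definition combining a backoff strategy with a response filter, assuming a rate-limiting scenario; note that the factory always appends a default HttpResponseFilter after any user-defined filters:

    # Illustrative DefaultErrorHandler manifest definition (status code and retry count are examples)
    default_error_handler_definition = {
        "type": "DefaultErrorHandler",
        "max_retries": 5,
        "backoff_strategies": [{"type": "ConstantBackoffStrategy", "backoff_time_in_seconds": 30}],
        "response_filters": [{"type": "HttpResponseFilter", "action": "RETRY", "http_codes": [429]}],
    }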
2256 def create_default_paginator( 2257 self, 2258 model: DefaultPaginatorModel, 2259 config: Config, 2260 *, 2261 url_base: str, 2262 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2263 decoder: Optional[Decoder] = None, 2264 cursor_used_for_stop_condition: Optional[Cursor] = None, 2265 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2266 if decoder: 2267 if self._is_supported_decoder_for_pagination(decoder): 2268 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2269 else: 2270 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2271 else: 2272 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2273 page_size_option = ( 2274 self._create_component_from_model(model=model.page_size_option, config=config) 2275 if model.page_size_option 2276 else None 2277 ) 2278 page_token_option = ( 2279 self._create_component_from_model(model=model.page_token_option, config=config) 2280 if model.page_token_option 2281 else None 2282 ) 2283 pagination_strategy = self._create_component_from_model( 2284 model=model.pagination_strategy, 2285 config=config, 2286 decoder=decoder_to_use, 2287 extractor_model=extractor_model, 2288 ) 2289 if cursor_used_for_stop_condition: 2290 pagination_strategy = StopConditionPaginationStrategyDecorator( 2291 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2292 ) 2293 paginator = DefaultPaginator( 2294 decoder=decoder_to_use, 2295 page_size_option=page_size_option, 2296 page_token_option=page_token_option, 2297 pagination_strategy=pagination_strategy, 2298 url_base=url_base, 2299 config=config, 2300 parameters=model.parameters or {}, 2301 ) 2302 if self._limit_pages_fetched_per_slice: 2303 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2304 return paginator
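A hedged sketch of a DefaultPaginator definition wiring a pagination strategy to request options, roughly mirroring the arguments assembled above (the query parameter names are assumptions about the target API):

    # Hypothetical DefaultPaginator manifest definition
    default_paginator_definition = {
        "type": "DefaultPaginator",
        "pagination_strategy": {"type": "PageIncrement", "page_size": 100},
        "page_size_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "per_page"},
        "page_token_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "page"},
    }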
2306 def create_dpath_extractor( 2307 self, 2308 model: DpathExtractorModel, 2309 config: Config, 2310 decoder: Optional[Decoder] = None, 2311 **kwargs: Any, 2312 ) -> DpathExtractor: 2313 if decoder: 2314 decoder_to_use = decoder 2315 else: 2316 decoder_to_use = JsonDecoder(parameters={}) 2317 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2318 return DpathExtractor( 2319 decoder=decoder_to_use, 2320 field_path=model_field_path, 2321 config=config, 2322 parameters=model.parameters or {}, 2323 )
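As an illustration, a DpathExtractor definition that would select records nested under data.items; the path is an assumption about the response shape:

    # Illustrative DpathExtractor manifest definition
    dpath_extractor_definition = {
        "type": "DpathExtractor",
        "field_path": ["data", "items"],
    }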
2344 def create_http_requester( 2345 self, 2346 model: HttpRequesterModel, 2347 config: Config, 2348 decoder: Decoder = JsonDecoder(parameters={}), 2349 query_properties_key: Optional[str] = None, 2350 use_cache: Optional[bool] = None, 2351 *, 2352 name: str, 2353 ) -> HttpRequester: 2354 authenticator = ( 2355 self._create_component_from_model( 2356 model=model.authenticator, 2357 config=config, 2358 url_base=model.url or model.url_base, 2359 name=name, 2360 decoder=decoder, 2361 ) 2362 if model.authenticator 2363 else None 2364 ) 2365 error_handler = ( 2366 self._create_component_from_model(model=model.error_handler, config=config) 2367 if model.error_handler 2368 else DefaultErrorHandler( 2369 backoff_strategies=[], 2370 response_filters=[], 2371 config=config, 2372 parameters=model.parameters or {}, 2373 ) 2374 ) 2375 2376 api_budget = self._api_budget 2377 2378 request_options_provider = InterpolatedRequestOptionsProvider( 2379 request_body=model.request_body, 2380 request_body_data=model.request_body_data, 2381 request_body_json=model.request_body_json, 2382 request_headers=model.request_headers, 2383 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2384 query_properties_key=query_properties_key, 2385 config=config, 2386 parameters=model.parameters or {}, 2387 ) 2388 2389 assert model.use_cache is not None # for mypy 2390 assert model.http_method is not None # for mypy 2391 2392 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2393 2394 return HttpRequester( 2395 name=name, 2396 url=model.url, 2397 url_base=model.url_base, 2398 path=model.path, 2399 authenticator=authenticator, 2400 error_handler=error_handler, 2401 api_budget=api_budget, 2402 http_method=HttpMethod[model.http_method.value], 2403 request_options_provider=request_options_provider, 2404 config=config, 2405 disable_retries=self._disable_retries, 2406 parameters=model.parameters or {}, 2407 message_repository=self._message_repository, 2408 use_cache=should_use_cache, 2409 decoder=decoder, 2410 stream_response=decoder.is_stream_response() if decoder else False, 2411 )
2413 @staticmethod 2414 def create_http_response_filter( 2415 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2416 ) -> HttpResponseFilter: 2417 if model.action: 2418 action = ResponseAction(model.action.value) 2419 else: 2420 action = None 2421 2422 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2423 2424 http_codes = ( 2425 set(model.http_codes) if model.http_codes else set() 2426 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2427 2428 return HttpResponseFilter( 2429 action=action, 2430 failure_type=failure_type, 2431 error_message=model.error_message or "", 2432 error_message_contains=model.error_message_contains or "", 2433 http_codes=http_codes, 2434 predicate=model.predicate or "", 2435 config=config, 2436 parameters=model.parameters or {}, 2437 )
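A minimal sketch of an HttpResponseFilter definition of the kind parsed here; the status code and message fragment are examples only:

    # Hypothetical HttpResponseFilter manifest definition
    response_filter_definition = {
        "type": "HttpResponseFilter",
        "action": "IGNORE",
        "http_codes": [403],
        "error_message_contains": "insufficient permissions",
    }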
2445 def create_complex_field_type( 2446 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2447 ) -> ComplexFieldType: 2448 items = ( 2449 self._create_component_from_model(model=model.items, config=config) 2450 if isinstance(model.items, ComplexFieldTypeModel) 2451 else model.items 2452 ) 2453 2454 return ComplexFieldType(field_type=model.field_type, items=items)
2456 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2457 target_type = ( 2458 self._create_component_from_model(model=model.target_type, config=config) 2459 if isinstance(model.target_type, ComplexFieldTypeModel) 2460 else model.target_type 2461 ) 2462 2463 return TypesMap( 2464 target_type=target_type, 2465 current_type=model.current_type, 2466 condition=model.condition if model.condition is not None else "True", 2467 )
2469 def create_schema_type_identifier( 2470 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2471 ) -> SchemaTypeIdentifier: 2472 types_mapping = [] 2473 if model.types_mapping: 2474 types_mapping.extend( 2475 [ 2476 self._create_component_from_model(types_map, config=config) 2477 for types_map in model.types_mapping 2478 ] 2479 ) 2480 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2481 [x for x in model.schema_pointer] if model.schema_pointer else [] 2482 ) 2483 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2484 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2485 [x for x in model.type_pointer] if model.type_pointer else None 2486 ) 2487 2488 return SchemaTypeIdentifier( 2489 schema_pointer=model_schema_pointer, 2490 key_pointer=model_key_pointer, 2491 type_pointer=model_type_pointer, 2492 types_mapping=types_mapping, 2493 parameters=model.parameters or {}, 2494 )
2496 def create_dynamic_schema_loader( 2497 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2498 ) -> DynamicSchemaLoader: 2499 schema_transformations = [] 2500 if model.schema_transformations: 2501 for transformation_model in model.schema_transformations: 2502 schema_transformations.append( 2503 self._create_component_from_model(model=transformation_model, config=config) 2504 ) 2505 name = "dynamic_properties" 2506 retriever = self._create_component_from_model( 2507 model=model.retriever, 2508 config=config, 2509 name=name, 2510 primary_key=None, 2511 partition_router=self._build_stream_slicer_from_partition_router( 2512 model.retriever, config 2513 ), 2514 transformations=[], 2515 use_cache=True, 2516 log_formatter=( 2517 lambda response: format_http_message( 2518 response, 2519 f"Schema loader '{name}' request", 2520 f"Request performed in order to extract schema.", 2521 name, 2522 is_auxiliary=True, 2523 ) 2524 ), 2525 ) 2526 schema_type_identifier = self._create_component_from_model( 2527 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2528 ) 2529 schema_filter = ( 2530 self._create_component_from_model( 2531 model.schema_filter, config=config, parameters=model.parameters or {} 2532 ) 2533 if model.schema_filter is not None 2534 else None 2535 ) 2536 2537 return DynamicSchemaLoader( 2538 retriever=retriever, 2539 config=config, 2540 schema_transformations=schema_transformations, 2541 schema_filter=schema_filter, 2542 schema_type_identifier=schema_type_identifier, 2543 parameters=model.parameters or {}, 2544 )
2564 def create_gzip_decoder( 2565 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2566 ) -> Decoder: 2567 _compressed_response_types = { 2568 "gzip", 2569 "x-gzip", 2570 "gzip, deflate", 2571 "x-gzip, deflate", 2572 "application/zip", 2573 "application/gzip", 2574 "application/x-gzip", 2575 "application/x-zip-compressed", 2576 } 2577 2578 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2579 2580 if self._emit_connector_builder_messages: 2581 # This is very surprising, but if the response is not streamed, 2582 # CompositeRawDecoder calls response.content and the requests library actually uncompresses the data, as opposed to response.raw, 2583 # which uses urllib3 directly and does not uncompress the data. 2584 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2585 2586 return CompositeRawDecoder.by_headers( 2587 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2588 stream_response=True, 2589 fallback_parser=gzip_parser.inner_parser, 2590 )
2639 def create_jwt_authenticator( 2640 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2641 ) -> JwtAuthenticator: 2642 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2643 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2644 request_option = ( 2645 self._create_component_from_model(model.request_option, config) 2646 if model.request_option 2647 else None 2648 ) 2649 return JwtAuthenticator( 2650 config=config, 2651 parameters=model.parameters or {}, 2652 algorithm=JwtAlgorithm(model.algorithm.value), 2653 secret_key=model.secret_key, 2654 base64_encode_secret_key=model.base64_encode_secret_key, 2655 token_duration=model.token_duration, 2656 header_prefix=model.header_prefix, 2657 kid=jwt_headers.kid, 2658 typ=jwt_headers.typ, 2659 cty=jwt_headers.cty, 2660 iss=jwt_payload.iss, 2661 sub=jwt_payload.sub, 2662 aud=jwt_payload.aud, 2663 additional_jwt_headers=model.additional_jwt_headers, 2664 additional_jwt_payload=model.additional_jwt_payload, 2665 passphrase=model.passphrase, 2666 request_option=request_option, 2667 )
2669 def create_list_partition_router( 2670 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2671 ) -> ListPartitionRouter: 2672 request_option = ( 2673 self._create_component_from_model(model.request_option, config) 2674 if model.request_option 2675 else None 2676 ) 2677 return ListPartitionRouter( 2678 cursor_field=model.cursor_field, 2679 request_option=request_option, 2680 values=model.values, 2681 config=config, 2682 parameters=model.parameters or {}, 2683 )
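A sketch of a ListPartitionRouter definition, assuming a connector that partitions requests by region and injects the value as a query parameter (field names and values are illustrative):

    # Hypothetical ListPartitionRouter manifest definition
    list_partition_router_definition = {
        "type": "ListPartitionRouter",
        "cursor_field": "region",
        "values": ["us", "eu", "apac"],
        "request_option": {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "region"},
    }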
2685 @staticmethod 2686 def create_min_max_datetime( 2687 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2688 ) -> MinMaxDatetime: 2689 return MinMaxDatetime( 2690 datetime=model.datetime, 2691 datetime_format=model.datetime_format or "", 2692 max_datetime=model.max_datetime or "", 2693 min_datetime=model.min_datetime or "", 2694 parameters=model.parameters or {}, 2695 )
2707 def create_oauth_authenticator( 2708 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2709 ) -> DeclarativeOauth2Authenticator: 2710 profile_assertion = ( 2711 self._create_component_from_model(model.profile_assertion, config=config) 2712 if model.profile_assertion 2713 else None 2714 ) 2715 2716 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = ( 2717 self._get_refresh_token_error_information(model) 2718 ) 2719 if model.refresh_token_updater: 2720 # ignore type error because fixing it would have a lot of dependencies, revisit later 2721 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2722 config, 2723 InterpolatedString.create( 2724 model.token_refresh_endpoint, # type: ignore 2725 parameters=model.parameters or {}, 2726 ).eval(config), 2727 access_token_name=InterpolatedString.create( 2728 model.access_token_name or "access_token", parameters=model.parameters or {} 2729 ).eval(config), 2730 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2731 expires_in_name=InterpolatedString.create( 2732 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2733 ).eval(config), 2734 client_id_name=InterpolatedString.create( 2735 model.client_id_name or "client_id", parameters=model.parameters or {} 2736 ).eval(config), 2737 client_id=InterpolatedString.create( 2738 model.client_id, parameters=model.parameters or {} 2739 ).eval(config) 2740 if model.client_id 2741 else model.client_id, 2742 client_secret_name=InterpolatedString.create( 2743 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2744 ).eval(config), 2745 client_secret=InterpolatedString.create( 2746 model.client_secret, parameters=model.parameters or {} 2747 ).eval(config) 2748 if model.client_secret 2749 else model.client_secret, 2750 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2751 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2752 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2753 grant_type_name=InterpolatedString.create( 2754 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2755 ).eval(config), 2756 grant_type=InterpolatedString.create( 2757 model.grant_type or "refresh_token", parameters=model.parameters or {} 2758 ).eval(config), 2759 refresh_request_body=InterpolatedMapping( 2760 model.refresh_request_body or {}, parameters=model.parameters or {} 2761 ).eval(config), 2762 refresh_request_headers=InterpolatedMapping( 2763 model.refresh_request_headers or {}, parameters=model.parameters or {} 2764 ).eval(config), 2765 scopes=model.scopes, 2766 token_expiry_date_format=model.token_expiry_date_format, 2767 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2768 message_repository=self._message_repository, 2769 refresh_token_error_status_codes=refresh_token_error_status_codes, 2770 refresh_token_error_key=refresh_token_error_key, 2771 refresh_token_error_values=refresh_token_error_values, 2772 ) 2773 # ignore type error because fixing it would have a lot of dependencies, revisit later 2774 return DeclarativeOauth2Authenticator( # type: ignore 2775 access_token_name=model.access_token_name or "access_token", 2776 access_token_value=model.access_token_value, 2777 client_id_name=model.client_id_name or "client_id", 2778 client_id=model.client_id, 2779 client_secret_name=model.client_secret_name or "client_secret", 2780 
client_secret=model.client_secret, 2781 expires_in_name=model.expires_in_name or "expires_in", 2782 grant_type_name=model.grant_type_name or "grant_type", 2783 grant_type=model.grant_type or "refresh_token", 2784 refresh_request_body=model.refresh_request_body, 2785 refresh_request_headers=model.refresh_request_headers, 2786 refresh_token_name=model.refresh_token_name or "refresh_token", 2787 refresh_token=model.refresh_token, 2788 scopes=model.scopes, 2789 token_expiry_date=model.token_expiry_date, 2790 token_expiry_date_format=model.token_expiry_date_format, 2791 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2792 token_refresh_endpoint=model.token_refresh_endpoint, 2793 config=config, 2794 parameters=model.parameters or {}, 2795 message_repository=self._message_repository, 2796 profile_assertion=profile_assertion, 2797 use_profile_assertion=model.use_profile_assertion, 2798 refresh_token_error_status_codes=refresh_token_error_status_codes, 2799 refresh_token_error_key=refresh_token_error_key, 2800 refresh_token_error_values=refresh_token_error_values, 2801 )
2851 def create_offset_increment( 2852 self, 2853 model: OffsetIncrementModel, 2854 config: Config, 2855 decoder: Decoder, 2856 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2857 **kwargs: Any, 2858 ) -> OffsetIncrement: 2859 if isinstance(decoder, PaginationDecoderDecorator): 2860 inner_decoder = decoder.decoder 2861 else: 2862 inner_decoder = decoder 2863 decoder = PaginationDecoderDecorator(decoder=decoder) 2864 2865 if self._is_supported_decoder_for_pagination(inner_decoder): 2866 decoder_to_use = decoder 2867 else: 2868 raise ValueError( 2869 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2870 ) 2871 2872 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2873 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2874 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2875 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2876 # When we have more time to investigate we can look into reusing the same component. 2877 extractor = ( 2878 self._create_component_from_model( 2879 model=extractor_model, config=config, decoder=decoder_to_use 2880 ) 2881 if extractor_model 2882 else None 2883 ) 2884 2885 return OffsetIncrement( 2886 page_size=model.page_size, 2887 config=config, 2888 decoder=decoder_to_use, 2889 extractor=extractor, 2890 inject_on_first_request=model.inject_on_first_request or False, 2891 parameters=model.parameters or {}, 2892 )
2894 @staticmethod 2895 def create_page_increment( 2896 model: PageIncrementModel, config: Config, **kwargs: Any 2897 ) -> PageIncrement: 2898 return PageIncrement( 2899 page_size=model.page_size, 2900 config=config, 2901 start_from_page=model.start_from_page or 0, 2902 inject_on_first_request=model.inject_on_first_request or False, 2903 parameters=model.parameters or {}, 2904 )
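For comparison with OffsetIncrement above, a minimal PageIncrement definition; the concrete values are illustrative:

    # Hypothetical PageIncrement manifest definition
    page_increment_definition = {
        "type": "PageIncrement",
        "page_size": 50,
        "start_from_page": 1,
        "inject_on_first_request": False,
    }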
2906 def create_parent_stream_config( 2907 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2908 ) -> ParentStreamConfig: 2909 declarative_stream = self._create_component_from_model( 2910 model.stream, 2911 config=config, 2912 is_parent=True, 2913 **kwargs, 2914 ) 2915 request_option = ( 2916 self._create_component_from_model(model.request_option, config=config) 2917 if model.request_option 2918 else None 2919 ) 2920 2921 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2922 raise ValueError( 2923 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2924 ) 2925 2926 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2927 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2928 ) 2929 2930 return ParentStreamConfig( 2931 parent_key=model.parent_key, 2932 request_option=request_option, 2933 stream=declarative_stream, 2934 partition_field=model.partition_field, 2935 config=config, 2936 incremental_dependency=model.incremental_dependency or False, 2937 parameters=model.parameters or {}, 2938 extra_fields=model.extra_fields, 2939 lazy_read_pointer=model_lazy_read_pointer, 2940 )
2942 def create_properties_from_endpoint( 2943 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2944 ) -> PropertiesFromEndpoint: 2945 retriever = self._create_component_from_model( 2946 model=model.retriever, 2947 config=config, 2948 name="dynamic_properties", 2949 primary_key=None, 2950 stream_slicer=None, 2951 transformations=[], 2952 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to be different 2953 ) 2954 return PropertiesFromEndpoint( 2955 property_field_path=model.property_field_path, 2956 retriever=retriever, 2957 config=config, 2958 parameters=model.parameters or {}, 2959 )
2961 def create_property_chunking( 2962 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2963 ) -> PropertyChunking: 2964 record_merge_strategy = ( 2965 self._create_component_from_model( 2966 model=model.record_merge_strategy, config=config, **kwargs 2967 ) 2968 if model.record_merge_strategy 2969 else None 2970 ) 2971 2972 property_limit_type: PropertyLimitType 2973 match model.property_limit_type: 2974 case PropertyLimitTypeModel.property_count: 2975 property_limit_type = PropertyLimitType.property_count 2976 case PropertyLimitTypeModel.characters: 2977 property_limit_type = PropertyLimitType.characters 2978 case _: 2979 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2980 2981 return PropertyChunking( 2982 property_limit_type=property_limit_type, 2983 property_limit=model.property_limit, 2984 record_merge_strategy=record_merge_strategy, 2985 config=config, 2986 parameters=model.parameters or {}, 2987 )
2989 def create_query_properties( 2990 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 2991 ) -> QueryProperties: 2992 if isinstance(model.property_list, list): 2993 property_list = model.property_list 2994 else: 2995 property_list = self._create_component_from_model( 2996 model=model.property_list, config=config, **kwargs 2997 ) 2998 2999 property_chunking = ( 3000 self._create_component_from_model( 3001 model=model.property_chunking, config=config, **kwargs 3002 ) 3003 if model.property_chunking 3004 else None 3005 ) 3006 3007 property_selector = ( 3008 self._create_component_from_model( 3009 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3010 ) 3011 if model.property_selector 3012 else None 3013 ) 3014 3015 return QueryProperties( 3016 property_list=property_list, 3017 always_include_properties=model.always_include_properties, 3018 property_chunking=property_chunking, 3019 property_selector=property_selector, 3020 config=config, 3021 parameters=model.parameters or {}, 3022 )
3024 def create_json_schema_property_selector( 3025 self, 3026 model: JsonSchemaPropertySelectorModel, 3027 config: Config, 3028 *, 3029 stream_name: str, 3030 **kwargs: Any, 3031 ) -> JsonSchemaPropertySelector: 3032 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3033 3034 transformations = [] 3035 if model.transformations: 3036 for transformation_model in model.transformations: 3037 transformations.append( 3038 self._create_component_from_model(model=transformation_model, config=config) 3039 ) 3040 3041 return JsonSchemaPropertySelector( 3042 configured_stream=configured_stream, 3043 properties_transformations=transformations, 3044 config=config, 3045 parameters=model.parameters or {}, 3046 )
3060 @staticmethod 3061 def create_request_option( 3062 model: RequestOptionModel, config: Config, **kwargs: Any 3063 ) -> RequestOption: 3064 inject_into = RequestOptionType(model.inject_into.value) 3065 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3066 [ 3067 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3068 for segment in model.field_path 3069 ] 3070 if model.field_path 3071 else None 3072 ) 3073 field_name = ( 3074 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3075 if model.field_name 3076 else None 3077 ) 3078 return RequestOption( 3079 field_name=field_name, 3080 field_path=field_path, 3081 inject_into=inject_into, 3082 parameters=kwargs.get("parameters", {}), 3083 )
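Two hedged examples of RequestOption definitions as handled above: a flat field_name for a query parameter versus a nested field_path for body injection (all names are assumptions):

    # Hypothetical RequestOption manifest definitions
    query_param_option = {"type": "RequestOption", "inject_into": "request_parameter", "field_name": "since"}
    body_json_option = {"type": "RequestOption", "inject_into": "body_json", "field_path": ["filter", "updated_at"]}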
3085 def create_record_selector( 3086 self, 3087 model: RecordSelectorModel, 3088 config: Config, 3089 *, 3090 name: str, 3091 transformations: List[RecordTransformation] | None = None, 3092 decoder: Decoder | None = None, 3093 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3094 file_uploader: Optional[DefaultFileUploader] = None, 3095 **kwargs: Any, 3096 ) -> RecordSelector: 3097 extractor = self._create_component_from_model( 3098 model=model.extractor, decoder=decoder, config=config 3099 ) 3100 record_filter = ( 3101 self._create_component_from_model(model.record_filter, config=config) 3102 if model.record_filter 3103 else None 3104 ) 3105 3106 transform_before_filtering = ( 3107 False if model.transform_before_filtering is None else model.transform_before_filtering 3108 ) 3109 if client_side_incremental_sync_cursor: 3110 record_filter = ClientSideIncrementalRecordFilterDecorator( 3111 config=config, 3112 parameters=model.parameters, 3113 condition=model.record_filter.condition 3114 if (model.record_filter and hasattr(model.record_filter, "condition")) 3115 else None, 3116 cursor=client_side_incremental_sync_cursor, 3117 ) 3118 transform_before_filtering = ( 3119 True 3120 if model.transform_before_filtering is None 3121 else model.transform_before_filtering 3122 ) 3123 3124 if model.schema_normalization is None: 3125 # default to no schema normalization if not set 3126 model.schema_normalization = SchemaNormalizationModel.None_ 3127 3128 schema_normalization = ( 3129 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3130 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3131 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3132 ) 3133 3134 return RecordSelector( 3135 extractor=extractor, 3136 name=name, 3137 config=config, 3138 record_filter=record_filter, 3139 transformations=transformations or [], 3140 file_uploader=file_uploader, 3141 schema_normalization=schema_normalization, 3142 parameters=model.parameters or {}, 3143 transform_before_filtering=transform_before_filtering, 3144 )
3154 def create_selective_authenticator( 3155 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3156 ) -> DeclarativeAuthenticator: 3157 authenticators = { 3158 name: self._create_component_from_model(model=auth, config=config) 3159 for name, auth in model.authenticators.items() 3160 } 3161 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3162 return SelectiveAuthenticator( # type: ignore[abstract] 3163 config=config, 3164 authenticators=authenticators, 3165 authenticator_selection_path=model.authenticator_selection_path, 3166 **kwargs, 3167 )
3169 @staticmethod 3170 def create_legacy_session_token_authenticator( 3171 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3172 ) -> LegacySessionTokenAuthenticator: 3173 return LegacySessionTokenAuthenticator( 3174 api_url=url_base, 3175 header=model.header, 3176 login_url=model.login_url, 3177 password=model.password or "", 3178 session_token=model.session_token or "", 3179 session_token_response_key=model.session_token_response_key or "", 3180 username=model.username or "", 3181 validate_session_url=model.validate_session_url, 3182 config=config, 3183 parameters=model.parameters or {}, 3184 )
3186 def create_simple_retriever( 3187 self, 3188 model: SimpleRetrieverModel, 3189 config: Config, 3190 *, 3191 name: str, 3192 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3193 request_options_provider: Optional[RequestOptionsProvider] = None, 3194 cursor: Optional[Cursor] = None, 3195 has_stop_condition_cursor: bool = False, 3196 is_client_side_incremental_sync: bool = False, 3197 transformations: List[RecordTransformation], 3198 file_uploader: Optional[DefaultFileUploader] = None, 3199 incremental_sync: Optional[ 3200 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3201 ] = None, 3202 use_cache: Optional[bool] = None, 3203 log_formatter: Optional[Callable[[Response], Any]] = None, 3204 partition_router: Optional[PartitionRouter] = None, 3205 **kwargs: Any, 3206 ) -> SimpleRetriever: 3207 def _get_url(req: Requester) -> str: 3208 """ 3209 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3210 This is needed because the URL is not set until the requester is created. 3211 """ 3212 3213 _url: str = ( 3214 model.requester.url 3215 if hasattr(model.requester, "url") and model.requester.url is not None 3216 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3217 ) 3218 _url_base: str = ( 3219 model.requester.url_base 3220 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3221 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3222 ) 3223 3224 return _url or _url_base 3225 3226 if cursor is None: 3227 cursor = FinalStateCursor(name, None, self._message_repository) 3228 3229 decoder = ( 3230 self._create_component_from_model(model=model.decoder, config=config) 3231 if model.decoder 3232 else JsonDecoder(parameters={}) 3233 ) 3234 record_selector = self._create_component_from_model( 3235 model=model.record_selector, 3236 name=name, 3237 config=config, 3238 decoder=decoder, 3239 transformations=transformations, 3240 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3241 file_uploader=file_uploader, 3242 ) 3243 3244 query_properties: Optional[QueryProperties] = None 3245 query_properties_key: Optional[str] = None 3246 self._ensure_query_properties_to_model(model.requester) 3247 if self._has_query_properties_in_request_parameters(model.requester): 3248 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3249 # places instead of default to request_parameters which isn't clearly documented 3250 if ( 3251 hasattr(model.requester, "fetch_properties_from_endpoint") 3252 and model.requester.fetch_properties_from_endpoint 3253 ): 3254 raise ValueError( 3255 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3256 ) 3257 3258 query_properties_definitions = [] 3259 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3260 if isinstance(request_parameter, QueryPropertiesModel): 3261 query_properties_key = key 3262 query_properties_definitions.append(request_parameter) 3263 3264 if len(query_properties_definitions) > 1: 3265 raise ValueError( 3266 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3267 ) 3268 3269 if 
len(query_properties_definitions) == 1: 3270 query_properties = self._create_component_from_model( 3271 model=query_properties_definitions[0], stream_name=name, config=config 3272 ) 3273 3274 # Removes QueryProperties components from the interpolated mappings because it has been designed 3275 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 3276 # instead of through jinja interpolation 3277 if hasattr(model.requester, "request_parameters") and isinstance( 3278 model.requester.request_parameters, Mapping 3279 ): 3280 model.requester.request_parameters = self._remove_query_properties( 3281 model.requester.request_parameters 3282 ) 3283 elif ( 3284 hasattr(model.requester, "fetch_properties_from_endpoint") 3285 and model.requester.fetch_properties_from_endpoint 3286 ): 3287 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3288 query_properties_definition = QueryPropertiesModel( 3289 type="QueryProperties", 3290 property_list=model.requester.fetch_properties_from_endpoint, 3291 always_include_properties=None, 3292 property_chunking=None, 3293 ) # type: ignore # $parameters has a default value 3294 3295 query_properties = self.create_query_properties( 3296 model=query_properties_definition, 3297 stream_name=name, 3298 config=config, 3299 ) 3300 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3301 query_properties = self.create_query_properties( 3302 model=model.requester.query_properties, 3303 stream_name=name, 3304 config=config, 3305 ) 3306 3307 requester = self._create_component_from_model( 3308 model=model.requester, 3309 decoder=decoder, 3310 name=name, 3311 query_properties_key=query_properties_key, 3312 use_cache=use_cache, 3313 config=config, 3314 ) 3315 3316 if not request_options_provider: 3317 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3318 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3319 partition_router, PartitionRouter 3320 ): 3321 request_options_provider = partition_router 3322 3323 paginator = ( 3324 self._create_component_from_model( 3325 model=model.paginator, 3326 config=config, 3327 url_base=_get_url(requester), 3328 extractor_model=model.record_selector.extractor, 3329 decoder=decoder, 3330 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3331 ) 3332 if model.paginator 3333 else NoPagination(parameters={}) 3334 ) 3335 3336 ignore_stream_slicer_parameters_on_paginated_requests = ( 3337 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3338 ) 3339 3340 if ( 3341 model.partition_router 3342 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3343 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3344 and any( 3345 parent_stream_config.lazy_read_pointer 3346 for parent_stream_config in model.partition_router.parent_stream_configs 3347 ) 3348 ): 3349 if incremental_sync: 3350 if incremental_sync.type != "DatetimeBasedCursor": 3351 raise ValueError( 3352 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3353 ) 3354 3355 elif incremental_sync.step or incremental_sync.cursor_granularity: 3356 raise ValueError( 3357 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 
3358 ) 3359 3360 if model.decoder and model.decoder.type != "JsonDecoder": 3361 raise ValueError( 3362 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3363 ) 3364 3365 return LazySimpleRetriever( 3366 name=name, 3367 paginator=paginator, 3368 primary_key=primary_key, 3369 requester=requester, 3370 record_selector=record_selector, 3371 stream_slicer=_NO_STREAM_SLICING, 3372 request_option_provider=request_options_provider, 3373 config=config, 3374 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3375 parameters=model.parameters or {}, 3376 ) 3377 3378 if ( 3379 model.record_selector.record_filter 3380 and model.pagination_reset 3381 and model.pagination_reset.limits 3382 ): 3383 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3384 3385 return SimpleRetriever( 3386 name=name, 3387 paginator=paginator, 3388 primary_key=primary_key, 3389 requester=requester, 3390 record_selector=record_selector, 3391 stream_slicer=_NO_STREAM_SLICING, 3392 request_option_provider=request_options_provider, 3393 config=config, 3394 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3395 additional_query_properties=query_properties, 3396 log_formatter=self._get_log_formatter(log_formatter, name), 3397 pagination_tracker_factory=self._create_pagination_tracker_factory( 3398 model.pagination_reset, cursor 3399 ), 3400 parameters=model.parameters or {}, 3401 )
3479 def create_state_delegating_stream( 3480 self, 3481 model: StateDelegatingStreamModel, 3482 config: Config, 3483 has_parent_state: Optional[bool] = None, 3484 **kwargs: Any, 3485 ) -> DefaultStream: 3486 if ( 3487 model.full_refresh_stream.name != model.name 3488 or model.name != model.incremental_stream.name 3489 ): 3490 raise ValueError( 3491 f"The state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3492 ) 3493 3494 stream_model = self._get_state_delegating_stream_model( 3495 False if has_parent_state is None else has_parent_state, model 3496 ) 3497 3498 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always DeclarativeStreamModel
3539 def create_async_retriever( 3540 self, 3541 model: AsyncRetrieverModel, 3542 config: Config, 3543 *, 3544 name: str, 3545 primary_key: Optional[ 3546 Union[str, List[str], List[List[str]]] 3547 ], # this seems to be needed to match create_simple_retriever 3548 stream_slicer: Optional[StreamSlicer], 3549 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3550 transformations: List[RecordTransformation], 3551 **kwargs: Any, 3552 ) -> AsyncRetriever: 3553 if model.download_target_requester and not model.download_target_extractor: 3554 raise ValueError( 3555 f"`download_target_extractor` required if using a `download_target_requester`" 3556 ) 3557 3558 def _get_download_retriever( 3559 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3560 ) -> SimpleRetriever: 3561 # We create a record selector for the download retriever 3562 # with no schema normalization and no transformations, neither record filter 3563 # as all this occurs in the record_selector of the AsyncRetriever 3564 record_selector = RecordSelector( 3565 extractor=extractor, 3566 name=name, 3567 record_filter=None, 3568 transformations=[], 3569 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3570 config=config, 3571 parameters={}, 3572 ) 3573 paginator = ( 3574 self._create_component_from_model( 3575 model=model.download_paginator, 3576 decoder=_decoder, 3577 config=config, 3578 url_base="", 3579 ) 3580 if model.download_paginator 3581 else NoPagination(parameters={}) 3582 ) 3583 3584 return SimpleRetriever( 3585 requester=requester, 3586 record_selector=record_selector, 3587 primary_key=None, 3588 name=name, 3589 paginator=paginator, 3590 config=config, 3591 parameters={}, 3592 log_formatter=self._get_log_formatter(None, name), 3593 ) 3594 3595 def _get_job_timeout() -> datetime.timedelta: 3596 user_defined_timeout: Optional[int] = ( 3597 int( 3598 InterpolatedString.create( 3599 str(model.polling_job_timeout), 3600 parameters={}, 3601 ).eval(config) 3602 ) 3603 if model.polling_job_timeout 3604 else None 3605 ) 3606 3607 # check for user defined timeout during the test read or 15 minutes 3608 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3609 # default value for non-connector builder is 60 minutes. 
3610 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3611 3612 return ( 3613 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3614 ) 3615 3616 decoder = ( 3617 self._create_component_from_model(model=model.decoder, config=config) 3618 if model.decoder 3619 else JsonDecoder(parameters={}) 3620 ) 3621 record_selector = self._create_component_from_model( 3622 model=model.record_selector, 3623 config=config, 3624 decoder=decoder, 3625 name=name, 3626 transformations=transformations, 3627 client_side_incremental_sync=client_side_incremental_sync, 3628 ) 3629 3630 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3631 if self._should_limit_slices_fetched(): 3632 stream_slicer = cast( 3633 StreamSlicer, 3634 StreamSlicerTestReadDecorator( 3635 wrapped_slicer=stream_slicer, 3636 maximum_number_of_slices=self._limit_slices_fetched or 5, 3637 ), 3638 ) 3639 3640 creation_requester = self._create_component_from_model( 3641 model=model.creation_requester, 3642 decoder=decoder, 3643 config=config, 3644 name=f"job creation - {name}", 3645 ) 3646 polling_requester = self._create_component_from_model( 3647 model=model.polling_requester, 3648 decoder=decoder, 3649 config=config, 3650 name=f"job polling - {name}", 3651 ) 3652 job_download_components_name = f"job download - {name}" 3653 download_decoder = ( 3654 self._create_component_from_model(model=model.download_decoder, config=config) 3655 if model.download_decoder 3656 else JsonDecoder(parameters={}) 3657 ) 3658 download_extractor = ( 3659 self._create_component_from_model( 3660 model=model.download_extractor, 3661 config=config, 3662 decoder=download_decoder, 3663 parameters=model.parameters, 3664 ) 3665 if model.download_extractor 3666 else DpathExtractor( 3667 [], 3668 config=config, 3669 decoder=download_decoder, 3670 parameters=model.parameters or {}, 3671 ) 3672 ) 3673 download_requester = self._create_component_from_model( 3674 model=model.download_requester, 3675 decoder=download_decoder, 3676 config=config, 3677 name=job_download_components_name, 3678 ) 3679 download_retriever = _get_download_retriever( 3680 download_requester, download_extractor, download_decoder 3681 ) 3682 abort_requester = ( 3683 self._create_component_from_model( 3684 model=model.abort_requester, 3685 decoder=decoder, 3686 config=config, 3687 name=f"job abort - {name}", 3688 ) 3689 if model.abort_requester 3690 else None 3691 ) 3692 delete_requester = ( 3693 self._create_component_from_model( 3694 model=model.delete_requester, 3695 decoder=decoder, 3696 config=config, 3697 name=f"job delete - {name}", 3698 ) 3699 if model.delete_requester 3700 else None 3701 ) 3702 download_target_requester = ( 3703 self._create_component_from_model( 3704 model=model.download_target_requester, 3705 decoder=decoder, 3706 config=config, 3707 name=f"job extract_url - {name}", 3708 ) 3709 if model.download_target_requester 3710 else None 3711 ) 3712 status_extractor = self._create_component_from_model( 3713 model=model.status_extractor, decoder=decoder, config=config, name=name 3714 ) 3715 download_target_extractor = ( 3716 self._create_component_from_model( 3717 model=model.download_target_extractor, 3718 decoder=decoder, 3719 config=config, 3720 name=name, 3721 ) 3722 if model.download_target_extractor 3723 else None 3724 ) 3725 3726 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3727 creation_requester=creation_requester, 3728 polling_requester=polling_requester, 3729 
download_retriever=download_retriever, 3730 download_target_requester=download_target_requester, 3731 abort_requester=abort_requester, 3732 delete_requester=delete_requester, 3733 status_extractor=status_extractor, 3734 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3735 download_target_extractor=download_target_extractor, 3736 job_timeout=_get_job_timeout(), 3737 ) 3738 3739 async_job_partition_router = AsyncJobPartitionRouter( 3740 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3741 job_repository, 3742 stream_slices, 3743 self._job_tracker, 3744 self._message_repository, 3745 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3746 has_bulk_parent=False, 3747 # Set `job_max_retry` to 1 for the `Connector Builder` use-case. 3748 # `None` means the default of 3 retry attempts is used under the hood. 3749 job_max_retry=1 if self._emit_connector_builder_messages else None, 3750 ), 3751 stream_slicer=stream_slicer, 3752 config=config, 3753 parameters=model.parameters or {}, 3754 ) 3755 3756 return AsyncRetriever( 3757 record_selector=record_selector, 3758 stream_slicer=async_job_partition_router, 3759 config=config, 3760 parameters=model.parameters or {}, 3761 )
3763 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3764 config_migrations = [ 3765 self._create_component_from_model(migration, config) 3766 for migration in ( 3767 model.config_normalization_rules.config_migrations 3768 if ( 3769 model.config_normalization_rules 3770 and model.config_normalization_rules.config_migrations 3771 ) 3772 else [] 3773 ) 3774 ] 3775 config_transformations = [ 3776 self._create_component_from_model(transformation, config) 3777 for transformation in ( 3778 model.config_normalization_rules.transformations 3779 if ( 3780 model.config_normalization_rules 3781 and model.config_normalization_rules.transformations 3782 ) 3783 else [] 3784 ) 3785 ] 3786 config_validations = [ 3787 self._create_component_from_model(validation, config) 3788 for validation in ( 3789 model.config_normalization_rules.validations 3790 if ( 3791 model.config_normalization_rules 3792 and model.config_normalization_rules.validations 3793 ) 3794 else [] 3795 ) 3796 ] 3797 3798 return Spec( 3799 connection_specification=model.connection_specification, 3800 documentation_url=model.documentation_url, 3801 advanced_auth=model.advanced_auth, 3802 parameters={}, 3803 config_migrations=config_migrations, 3804 config_transformations=config_transformations, 3805 config_validations=config_validations, 3806 )
    def create_substream_partition_router(
        self,
        model: SubstreamPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> SubstreamPartitionRouter:
        parent_stream_configs = []
        if model.parent_stream_configs:
            parent_stream_configs.extend(
                [
                    self.create_parent_stream_config_with_substream_wrapper(
                        model=parent_stream_config, config=config, stream_name=stream_name, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

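    # Each parent stream config is wrapped via create_parent_stream_config_with_substream_wrapper
    # (defined below) so that parent-stream reads run with their own state manager and with a
    # message repository that marks their output as substream/auxiliary traffic.
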
    def create_parent_stream_config_with_substream_wrapper(
        self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
    ) -> Any:
        # Fetch the child stream's state; it is used below to instantiate the parent stream's state manager.
        child_state = self._connector_state_manager.get_stream_state(stream_name, None)

        # This flag is used exclusively by StateDelegatingStream when a parent stream is created.
        has_parent_state = bool(
            self._connector_state_manager.get_stream_state(stream_name, None)
            if model.incremental_dependency
            else False
        )
        connector_state_manager = self._instantiate_parent_stream_state_manager(
            child_state, config, model, has_parent_state
        )

        substream_factory = ModelToComponentFactory(
            connector_state_manager=connector_state_manager,
            limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
            limit_slices_fetched=self._limit_slices_fetched,
            emit_connector_builder_messages=self._emit_connector_builder_messages,
            disable_retries=self._disable_retries,
            disable_cache=self._disable_cache,
            message_repository=StateFilteringMessageRepository(
                LogAppenderMessageRepositoryDecorator(
                    {
                        "airbyte_cdk": {"stream": {"is_substream": True}},
                        "http": {"is_auxiliary": True},
                    },
                    self._message_repository,
                    self._evaluate_log_level(self._emit_connector_builder_messages),
                ),
            ),
            api_budget=self._api_budget,
        )

        return substream_factory.create_parent_stream_config(
            model=model, config=config, stream_name=stream_name, **kwargs
        )

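    # The dedicated ModelToComponentFactory above isolates parent-stream creation: the parent
    # gets a connector state manager seeded from the child stream's state, its records are
    # tagged with "is_substream" and its HTTP logs with "is_auxiliary", and
    # StateFilteringMessageRepository (as its name suggests) keeps the parent's state messages
    # out of the child stream's output.
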
    @staticmethod
    def create_wait_time_from_header(
        model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitTimeFromHeaderBackoffStrategy:
        return WaitTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            regex=model.regex,
            max_waiting_time_in_seconds=model.max_waiting_time_in_seconds,
        )

    @staticmethod
    def create_wait_until_time_from_header(
        model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitUntilTimeFromHeaderBackoffStrategy:
        return WaitUntilTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            min_wait=model.min_wait,
            regex=model.regex,
        )

    @staticmethod
    def create_components_mapping_definition(
        model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
    ) -> ComponentMappingDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        field_path = [
            InterpolatedString.create(path, parameters=model.parameters or {})
            for path in model.field_path
        ]
        return ComponentMappingDefinition(
            field_path=field_path,  # type: ignore[arg-type]  # field_path can be str and InterpolatedString
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            create_or_update=model.create_or_update,
            condition=model.condition,
            parameters=model.parameters or {},
        )

    def create_http_components_resolver(
        self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
    ) -> Any:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=stream_name if stream_name else "__http_components_resolver",
            primary_key=None,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            transformations=[],
        )

        components_mapping = []
        for component_mapping_definition_model in model.components_mapping:
            if component_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=component_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        component_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )

        return HttpComponentsResolver(
            retriever=retriever,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            list(model.configs_pointer) if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            default_values=model.default_values,
            parameters=model.parameters or {},
        )

    def create_config_components_resolver(
        self,
        model: ConfigComponentsResolverModel,
        config: Config,
    ) -> Any:
        model_stream_configs = (
            model.stream_config if isinstance(model.stream_config, list) else [model.stream_config]
        )

        stream_configs = [
            self._create_component_from_model(
                stream_config, config=config, parameters=model.parameters or {}
            )
            for stream_config in model_stream_configs
        ]

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_configs=stream_configs,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    def create_parametrized_components_resolver(
        self,
        model: ParametrizedComponentsResolverModel,
        config: Config,
    ) -> ParametrizedComponentsResolver:
        stream_parameters = StreamParametersDefinition(
            list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
        )

        components_mapping = []
        for components_mapping_definition_model in model.components_mapping:
            if components_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=components_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        components_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )
        return ParametrizedComponentsResolver(
            stream_parameters=stream_parameters,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )

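    # When the manifest does not override them, the budget above falls back to the commonly
    # used "ratelimit-reset" / "ratelimit-remaining" response headers and treats HTTP 429 as
    # the rate-limit-hit status code.
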
    def create_fixed_window_call_rate_policy(
        self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> FixedWindowCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        # Set the initial reset timestamp to 10 days from now.
        # This value will be updated by the first request.
        return FixedWindowCallRatePolicy(
            next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10),
            period=parse_duration(model.period),
            call_limit=model.call_limit,
            matchers=matchers,
        )

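    # `model.period` is an ISO 8601 duration string parsed by isodate.parse_duration; as an
    # illustrative example, parse_duration("PT1H") yields a one-hour window.
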
    def create_file_uploader(
        self, model: FileUploaderModel, config: Config, **kwargs: Any
    ) -> FileUploader:
        name = "File Uploader"
        requester = self._create_component_from_model(
            model=model.requester,
            config=config,
            name=name,
            **kwargs,
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            config=config,
            name=name,
            **kwargs,
        )
        emit_connector_builder_messages = self._emit_connector_builder_messages
        file_uploader = DefaultFileUploader(
            requester=requester,
            download_target_extractor=download_target_extractor,
            config=config,
            file_writer=NoopFileWriter()
            if emit_connector_builder_messages
            else LocalFileSystemFileWriter(),
            parameters=model.parameters or {},
            filename_extractor=model.filename_extractor if model.filename_extractor else None,
        )

        return (
            ConnectorBuilderFileUploader(file_uploader)
            if emit_connector_builder_messages
            else file_uploader
        )

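    # In Connector Builder mode the uploader above uses NoopFileWriter (which, as the name
    # suggests, skips writing to disk) and is wrapped in ConnectorBuilderFileUploader; regular
    # syncs use LocalFileSystemFileWriter with the plain DefaultFileUploader.
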
    def create_moving_window_call_rate_policy(
        self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> MovingWindowCallRatePolicy:
        rates = [
            self._create_component_from_model(model=rate, config=config) for rate in model.rates
        ]
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]
        return MovingWindowCallRatePolicy(
            rates=rates,
            matchers=matchers,
        )

    def create_unlimited_call_rate_policy(
        self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )

    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
        )

    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )

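    # Partitions produced by the underlying router are grouped into batches of `group_size`;
    # deduplication of repeated partitions defaults to on unless the model explicitly sets
    # `deduplicate` to false.
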