airbyte_cdk.sources.declarative.parsers.model_to_component_factory
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.models import FailureType, Level
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative import transformations
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ChildPartitionResumableFullRefreshCursor,
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
    CursorFactory,
    DatetimeBasedCursor,
    DeclarativeCursor,
    GlobalSubstreamCursor,
    PerPartitionCursor,
    PerPartitionWithGlobalCursor,
    ResumableFullRefreshCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomConfigTransformation as CustomConfigTransformationModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomIncrementalSync as CustomIncrementalSyncModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    CustomValidationStrategy as CustomValidationStrategyModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
    COMPONENTS_MODULE_NAME,
    SDM_COMPONENTS_MODULE_NAME,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    ParametrizedComponentsResolver,
    StreamConfig,
    StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import (
    StreamSlicer,
    StreamSlicerTestReadDecorator,
)
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        disable_resumable_full_refresh: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._disable_resumable_full_refresh = disable_resumable_full_refresh
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomIncrementalSyncModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_declarative_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

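    # Illustrative usage sketch (not part of the module): given a manifest snippet whose
    # "type" matches the model class name, the factory parses it into the generated Pydantic
    # model and then builds the runtime component. The names `stream_definition` and
    # `connector_config` below are hypothetical placeholders.
    #
    #   factory = ModelToComponentFactory()
    #   stream = factory.create_component(
    #       model_type=DeclarativeStreamModel,
    #       component_definition=stream_definition,  # e.g. {"type": "DeclarativeStream", ...}
    #       config=connector_config,
    #   )
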
    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the
        DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through
        each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it
        has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

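    # Illustrative manifest sketch (not part of the module): an AddFields component definition
    # like the one below would be parsed into AddFieldsModel and routed through
    # create_add_fields / create_added_field_definition. The field path and interpolated value
    # are hypothetical examples.
    #
    #   {
    #       "type": "AddFields",
    #       "fields": [
    #           {"path": ["ingested_at"], "value": "{{ now_utc() }}"},
    #       ],
    #   }
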
    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

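    # Illustrative manifest sketch (not part of the module): an ApiKeyAuthenticator definition
    # using inject_into rather than the deprecated header option. The header name and config
    # key below are hypothetical.
    #
    #   {
    #       "type": "ApiKeyAuthenticator",
    #       "api_token": "{{ config['api_key'] }}",
    #       "inject_into": {
    #           "type": "RequestOption",
    #           "inject_into": "header",
    #           "field_name": "X-Api-Key",
    #       },
    #   }
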
    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state

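    # Illustrative sketch (not part of the module): each migration is asked whether it applies
    # to the incoming state and, if so, rewrites it; migrations are chained in order. The state
    # shapes and the `legacy_to_per_partition_migration` name below are hypothetical.
    #
    #   legacy_state = {"created_at": "2024-01-01"}
    #   migrated = ModelToComponentFactory.apply_stream_state_migrations(
    #       stream_state_migrations=[legacy_to_per_partition_migration],
    #       stream_state=legacy_state,
    #   )
    #   # migrated might now hold a per-partition layout such as
    #   # {"states": [{"partition": {...}, "cursor": {"created_at": "2024-01-01"}}]}
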
    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        stream_state_migrations: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # Per-partition incremental streams can dynamically create child cursors which will pass their current
        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
        # incoming state and connector_state_manager that is initialized when the component factory is created
        stream_state = (
            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
            if "stream_state" not in kwargs
            else kwargs["stream_state"]
        )
        stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=datetime_based_cursor_model.parameters or {},
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=datetime_based_cursor_model.parameters or {},
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=datetime_based_cursor_model.parameters or {},
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )

    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # Per-partition incremental streams can dynamically create child cursors which will pass their current
        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
        # incoming state and connector_state_manager that is initialized when the component factory is created
        stream_state = (
            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
            if "stream_state" not in kwargs
            else kwargs["stream_state"]
        )

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        interpolated_start_value = (
            InterpolatedString.create(
                incrementing_count_cursor_model.start_value,  # type: ignore
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            if incrementing_count_cursor_model.start_value
            else 0
        )

        interpolated_cursor_field = InterpolatedString.create(
            incrementing_count_cursor_model.cursor_field,
            parameters=incrementing_count_cursor_model.parameters or {},
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
1526 connector_state_manager=self._connector_state_manager, 1527 connector_state_converter=connector_state_converter, 1528 cursor_field=cursor_field, 1529 slice_boundary_fields=None, 1530 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1531 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1532 ) 1533 1534 def _assemble_weekday(self, weekday: str) -> Weekday: 1535 match weekday: 1536 case "MONDAY": 1537 return Weekday.MONDAY 1538 case "TUESDAY": 1539 return Weekday.TUESDAY 1540 case "WEDNESDAY": 1541 return Weekday.WEDNESDAY 1542 case "THURSDAY": 1543 return Weekday.THURSDAY 1544 case "FRIDAY": 1545 return Weekday.FRIDAY 1546 case "SATURDAY": 1547 return Weekday.SATURDAY 1548 case "SUNDAY": 1549 return Weekday.SUNDAY 1550 case _: 1551 raise ValueError(f"Unknown weekday {weekday}") 1552 1553 def create_concurrent_cursor_from_perpartition_cursor( 1554 self, 1555 state_manager: ConnectorStateManager, 1556 model_type: Type[BaseModel], 1557 component_definition: ComponentDefinition, 1558 stream_name: str, 1559 stream_namespace: Optional[str], 1560 config: Config, 1561 stream_state: MutableMapping[str, Any], 1562 partition_router: PartitionRouter, 1563 stream_state_migrations: Optional[List[Any]] = None, 1564 **kwargs: Any, 1565 ) -> ConcurrentPerPartitionCursor: 1566 component_type = component_definition.get("type") 1567 if component_definition.get("type") != model_type.__name__: 1568 raise ValueError( 1569 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1570 ) 1571 1572 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1573 1574 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1575 raise ValueError( 1576 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1577 ) 1578 1579 interpolated_cursor_field = InterpolatedString.create( 1580 datetime_based_cursor_model.cursor_field, 1581 parameters=datetime_based_cursor_model.parameters or {}, 1582 ) 1583 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1584 1585 datetime_format = datetime_based_cursor_model.datetime_format 1586 1587 cursor_granularity = ( 1588 parse_duration(datetime_based_cursor_model.cursor_granularity) 1589 if datetime_based_cursor_model.cursor_granularity 1590 else None 1591 ) 1592 1593 connector_state_converter: DateTimeStreamStateConverter 1594 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1595 datetime_format=datetime_format, 1596 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1597 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1598 cursor_granularity=cursor_granularity, 1599 ) 1600 1601 # Create the cursor factory 1602 cursor_factory = ConcurrentCursorFactory( 1603 partial( 1604 self.create_concurrent_cursor_from_datetime_based_cursor, 1605 state_manager=state_manager, 1606 model_type=model_type, 1607 component_definition=component_definition, 1608 stream_name=stream_name, 1609 stream_namespace=stream_namespace, 1610 config=config, 1611 message_repository=NoopMessageRepository(), 1612 stream_state_migrations=stream_state_migrations, 1613 ) 1614 ) 1615 1616 
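# Hedged illustration (hypothetical values, not taken from a real connector): the per-partition cursor
# assembled below consumes "sequential" per-partition state shaped roughly like the mapping sketched here,
# where each partition carries its own datetime cursor value:
#
#     stream_state = {
#         "states": [
#             {"partition": {"parent_id": "123"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}},
#             {"partition": {"parent_id": "456"}, "cursor": {"updated_at": "2024-02-01T00:00:00Z"}},
#         ],
#     }
#
# The partition keys and the cursor field name are placeholders; only the general
# "states"/"partition"/"cursor" layout is assumed here.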
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1617 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1618 use_global_cursor = isinstance( 1619 partition_router, GroupingPartitionRouter 1620 ) or component_definition.get("global_substream_cursor", False) 1621 1622 # Return the concurrent cursor and state converter 1623 return ConcurrentPerPartitionCursor( 1624 cursor_factory=cursor_factory, 1625 partition_router=partition_router, 1626 stream_name=stream_name, 1627 stream_namespace=stream_namespace, 1628 stream_state=stream_state, 1629 message_repository=self._message_repository, # type: ignore 1630 connector_state_manager=state_manager, 1631 connector_state_converter=connector_state_converter, 1632 cursor_field=cursor_field, 1633 use_global_cursor=use_global_cursor, 1634 ) 1635 1636 @staticmethod 1637 def create_constant_backoff_strategy( 1638 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1639 ) -> ConstantBackoffStrategy: 1640 return ConstantBackoffStrategy( 1641 backoff_time_in_seconds=model.backoff_time_in_seconds, 1642 config=config, 1643 parameters=model.parameters or {}, 1644 ) 1645 1646 def create_cursor_pagination( 1647 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1648 ) -> CursorPaginationStrategy: 1649 if isinstance(decoder, PaginationDecoderDecorator): 1650 inner_decoder = decoder.decoder 1651 else: 1652 inner_decoder = decoder 1653 decoder = PaginationDecoderDecorator(decoder=decoder) 1654 1655 if self._is_supported_decoder_for_pagination(inner_decoder): 1656 decoder_to_use = decoder 1657 else: 1658 raise ValueError( 1659 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1660 ) 1661 1662 return CursorPaginationStrategy( 1663 cursor_value=model.cursor_value, 1664 decoder=decoder_to_use, 1665 page_size=model.page_size, 1666 stop_condition=model.stop_condition, 1667 config=config, 1668 parameters=model.parameters or {}, 1669 ) 1670 1671 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1672 """ 1673 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1674 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1675 :param model: The Pydantic model of the custom component being created 1676 :param config: The custom defined connector config 1677 :return: The declarative component built from the Pydantic model to be used at runtime 1678 """ 1679 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1680 component_fields = get_type_hints(custom_component_class) 1681 model_args = model.dict() 1682 model_args["config"] = config 1683 1684 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1685 # we defer to these arguments over the component's definition 1686 for key, arg in kwargs.items(): 1687 model_args[key] = arg 1688 1689 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1690 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1691 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1692 for model_field, model_value in model_args.items(): 1693 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1694 if ( 1695 isinstance(model_value, dict) 1696 and "type" not in model_value 1697 and model_field in component_fields 1698 ): 1699 derived_type = self._derive_component_type_from_type_hints( 1700 component_fields.get(model_field) 1701 ) 1702 if derived_type: 1703 model_value["type"] = derived_type 1704 1705 if self._is_component(model_value): 1706 model_args[model_field] = self._create_nested_component( 1707 model, model_field, model_value, config 1708 ) 1709 elif isinstance(model_value, list): 1710 vals = [] 1711 for v in model_value: 1712 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1713 derived_type = self._derive_component_type_from_type_hints( 1714 component_fields.get(model_field) 1715 ) 1716 if derived_type: 1717 v["type"] = derived_type 1718 if self._is_component(v): 1719 vals.append(self._create_nested_component(model, model_field, v, config)) 1720 else: 1721 vals.append(v) 1722 model_args[model_field] = vals 1723 1724 kwargs = { 1725 class_field: model_args[class_field] 1726 for class_field in component_fields.keys() 1727 if class_field in model_args 1728 } 1729 return custom_component_class(**kwargs) 1730 1731 @staticmethod 1732 def _get_class_from_fully_qualified_class_name( 1733 full_qualified_class_name: str, 1734 ) -> Any: 1735 """Get a class from its fully qualified name. 1736 1737 If a custom components module is needed, we assume it is already registered - probably 1738 as `source_declarative_manifest.components` or `components`. 1739 1740 Args: 1741 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1742 1743 Returns: 1744 Any: The class object. 1745 1746 Raises: 1747 ValueError: If the class cannot be loaded. 1748 """ 1749 split = full_qualified_class_name.split(".") 1750 module_name_full = ".".join(split[:-1]) 1751 class_name = split[-1] 1752 1753 try: 1754 module_ref = importlib.import_module(module_name_full) 1755 except ModuleNotFoundError as e: 1756 if split[0] == "source_declarative_manifest": 1757 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1758 try: 1759 import os 1760 1761 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1762 module_ref = importlib.import_module( 1763 module_name_with_source_declarative_manifest 1764 ) 1765 except ModuleNotFoundError: 1766 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1767 else: 1768 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1769 1770 try: 1771 return getattr(module_ref, class_name) 1772 except AttributeError as e: 1773 raise ValueError( 1774 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1775 ) from e 1776 1777 @staticmethod 1778 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1779 interface = field_type 1780 while True: 1781 origin = get_origin(interface) 1782 if origin: 1783 # Unnest types until we reach the raw type 1784 # List[T] -> T 1785 # Optional[List[T]] -> T 1786 args = get_args(interface) 1787 interface = args[0] 1788 else: 1789 break 1790 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1791 return interface.__name__ 1792 return None 1793 1794 @staticmethod 1795 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1796 if not cls: 1797 return False 1798 return cls.__module__ == "builtins" 1799 1800 @staticmethod 1801 def _extract_missing_parameters(error: TypeError) -> List[str]: 1802 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1803 if parameter_search: 1804 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1805 else: 1806 return [] 1807 1808 def _create_nested_component( 1809 self, model: Any, model_field: str, model_value: Any, config: Config 1810 ) -> Any: 1811 type_name = model_value.get("type", None) 1812 if not type_name: 1813 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1814 return model_value 1815 1816 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1817 if model_type: 1818 parsed_model = model_type.parse_obj(model_value) 1819 try: 1820 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1821 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1822 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1823 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1824 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1825 # are needed by a component and could not be shared. 
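# Hedged example (hypothetical manifest fragment, shown as the Python mapping this factory receives):
# a subcomponent nested under a custom component can receive an otherwise-shared field such as
# url_base through $parameters, e.g.
#
#     model_value = {
#         "type": "DefaultPaginator",
#         "$parameters": {"url_base": "https://api.example.com/v1"},
#         "pagination_strategy": {"type": "PageIncrement", "page_size": 100},
#     }
#
# Only the $parameters keys matching keyword-only arguments of the corresponding create_* method are
# forwarded below; the URL and page size are placeholder values.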
1826 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1827 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1828 model_parameters = model_value.get("$parameters", {}) 1829 matching_parameters = { 1830 kwarg: model_parameters[kwarg] 1831 for kwarg in constructor_kwargs 1832 if kwarg in model_parameters 1833 } 1834 return self._create_component_from_model( 1835 model=parsed_model, config=config, **matching_parameters 1836 ) 1837 except TypeError as error: 1838 missing_parameters = self._extract_missing_parameters(error) 1839 if missing_parameters: 1840 raise ValueError( 1841 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1842 + ", ".join( 1843 ( 1844 f"{type_name}.$parameters.{parameter}" 1845 for parameter in missing_parameters 1846 ) 1847 ) 1848 ) 1849 raise TypeError( 1850 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1851 ) 1852 else: 1853 raise ValueError( 1854 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1855 ) 1856 1857 @staticmethod 1858 def _is_component(model_value: Any) -> bool: 1859 return isinstance(model_value, dict) and model_value.get("type") is not None 1860 1861 def create_datetime_based_cursor( 1862 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1863 ) -> DatetimeBasedCursor: 1864 start_datetime: Union[str, MinMaxDatetime] = ( 1865 model.start_datetime 1866 if isinstance(model.start_datetime, str) 1867 else self.create_min_max_datetime(model.start_datetime, config) 1868 ) 1869 end_datetime: Union[str, MinMaxDatetime, None] = None 1870 if model.is_data_feed and model.end_datetime: 1871 raise ValueError("Data feed does not support end_datetime") 1872 if model.is_data_feed and model.is_client_side_incremental: 1873 raise ValueError( 1874 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them.
1875 ) 1876 if model.end_datetime: 1877 end_datetime = ( 1878 model.end_datetime 1879 if isinstance(model.end_datetime, str) 1880 else self.create_min_max_datetime(model.end_datetime, config) 1881 ) 1882 1883 end_time_option = ( 1884 self._create_component_from_model( 1885 model.end_time_option, config, parameters=model.parameters or {} 1886 ) 1887 if model.end_time_option 1888 else None 1889 ) 1890 start_time_option = ( 1891 self._create_component_from_model( 1892 model.start_time_option, config, parameters=model.parameters or {} 1893 ) 1894 if model.start_time_option 1895 else None 1896 ) 1897 1898 return DatetimeBasedCursor( 1899 cursor_field=model.cursor_field, 1900 cursor_datetime_formats=model.cursor_datetime_formats 1901 if model.cursor_datetime_formats 1902 else [], 1903 cursor_granularity=model.cursor_granularity, 1904 datetime_format=model.datetime_format, 1905 end_datetime=end_datetime, 1906 start_datetime=start_datetime, 1907 step=model.step, 1908 end_time_option=end_time_option, 1909 lookback_window=model.lookback_window, 1910 start_time_option=start_time_option, 1911 partition_field_end=model.partition_field_end, 1912 partition_field_start=model.partition_field_start, 1913 message_repository=self._message_repository, 1914 is_compare_strictly=model.is_compare_strictly, 1915 config=config, 1916 parameters=model.parameters or {}, 1917 ) 1918 1919 def create_declarative_stream( 1920 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1921 ) -> DeclarativeStream: 1922 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1923 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1924 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1925 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1926 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1927 1928 primary_key = model.primary_key.__root__ if model.primary_key else None 1929 stop_condition_on_cursor = ( 1930 model.incremental_sync 1931 and hasattr(model.incremental_sync, "is_data_feed") 1932 and model.incremental_sync.is_data_feed 1933 ) 1934 client_side_incremental_sync = None 1935 if ( 1936 model.incremental_sync 1937 and hasattr(model.incremental_sync, "is_client_side_incremental") 1938 and model.incremental_sync.is_client_side_incremental 1939 ): 1940 supported_slicers = ( 1941 DatetimeBasedCursor, 1942 GlobalSubstreamCursor, 1943 PerPartitionWithGlobalCursor, 1944 ) 1945 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1946 raise ValueError( 1947 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1948 ) 1949 cursor = ( 1950 combined_slicers 1951 if isinstance( 1952 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1953 ) 1954 else self._create_component_from_model(model=model.incremental_sync, config=config) 1955 ) 1956 1957 client_side_incremental_sync = {"cursor": cursor} 1958 1959 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1960 cursor_model = model.incremental_sync 1961 1962 end_time_option = ( 1963 self._create_component_from_model( 1964 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1965 ) 1966 if cursor_model.end_time_option 1967 else None 1968 ) 1969 start_time_option = ( 1970 self._create_component_from_model( 1971 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1972 ) 1973 if cursor_model.start_time_option 1974 else None 1975 ) 1976 1977 request_options_provider = DatetimeBasedRequestOptionsProvider( 1978 start_time_option=start_time_option, 1979 end_time_option=end_time_option, 1980 partition_field_start=cursor_model.partition_field_end, 1981 partition_field_end=cursor_model.partition_field_end, 1982 config=config, 1983 parameters=model.parameters or {}, 1984 ) 1985 elif model.incremental_sync and isinstance( 1986 model.incremental_sync, IncrementingCountCursorModel 1987 ): 1988 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1989 1990 start_time_option = ( 1991 self._create_component_from_model( 1992 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1993 config, 1994 parameters=cursor_model.parameters or {}, 1995 ) 1996 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1997 else None 1998 ) 1999 2000 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2001 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2002 partition_field_start = "start" 2003 2004 request_options_provider = DatetimeBasedRequestOptionsProvider( 2005 start_time_option=start_time_option, 2006 partition_field_start=partition_field_start, 2007 config=config, 2008 parameters=model.parameters or {}, 2009 ) 2010 else: 2011 request_options_provider = None 2012 2013 transformations = [] 2014 if model.transformations: 2015 for transformation_model in model.transformations: 2016 transformations.append( 2017 self._create_component_from_model(model=transformation_model, config=config) 2018 ) 2019 file_uploader = None 2020 if model.file_uploader: 2021 file_uploader = self._create_component_from_model( 2022 model=model.file_uploader, config=config 2023 ) 2024 2025 retriever = self._create_component_from_model( 2026 model=model.retriever, 2027 config=config, 2028 name=model.name, 2029 primary_key=primary_key, 2030 stream_slicer=combined_slicers, 2031 request_options_provider=request_options_provider, 2032 stop_condition_on_cursor=stop_condition_on_cursor, 2033 client_side_incremental_sync=client_side_incremental_sync, 2034 transformations=transformations, 2035 file_uploader=file_uploader, 2036 incremental_sync=model.incremental_sync, 2037 ) 2038 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2039 2040 if model.state_migrations: 2041 state_transformations = [ 2042 self._create_component_from_model(state_migration, config, declarative_stream=model) 2043 for 
state_migration in model.state_migrations 2044 ] 2045 else: 2046 state_transformations = [] 2047 2048 schema_loader: Union[ 2049 CompositeSchemaLoader, 2050 DefaultSchemaLoader, 2051 DynamicSchemaLoader, 2052 InlineSchemaLoader, 2053 JsonFileSchemaLoader, 2054 ] 2055 if model.schema_loader and isinstance(model.schema_loader, list): 2056 nested_schema_loaders = [ 2057 self._create_component_from_model(model=nested_schema_loader, config=config) 2058 for nested_schema_loader in model.schema_loader 2059 ] 2060 schema_loader = CompositeSchemaLoader( 2061 schema_loaders=nested_schema_loaders, parameters={} 2062 ) 2063 elif model.schema_loader: 2064 schema_loader = self._create_component_from_model( 2065 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2066 config=config, 2067 ) 2068 else: 2069 options = model.parameters or {} 2070 if "name" not in options: 2071 options["name"] = model.name 2072 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2073 2074 return DeclarativeStream( 2075 name=model.name or "", 2076 primary_key=primary_key, 2077 retriever=retriever, 2078 schema_loader=schema_loader, 2079 stream_cursor_field=cursor_field or "", 2080 state_migrations=state_transformations, 2081 config=config, 2082 parameters=model.parameters or {}, 2083 ) 2084 2085 def _build_stream_slicer_from_partition_router( 2086 self, 2087 model: Union[ 2088 AsyncRetrieverModel, 2089 CustomRetrieverModel, 2090 SimpleRetrieverModel, 2091 ], 2092 config: Config, 2093 stream_name: Optional[str] = None, 2094 ) -> Optional[PartitionRouter]: 2095 if ( 2096 hasattr(model, "partition_router") 2097 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2098 and model.partition_router 2099 ): 2100 stream_slicer_model = model.partition_router 2101 if isinstance(stream_slicer_model, list): 2102 return CartesianProductStreamSlicer( 2103 [ 2104 self._create_component_from_model( 2105 model=slicer, config=config, stream_name=stream_name or "" 2106 ) 2107 for slicer in stream_slicer_model 2108 ], 2109 parameters={}, 2110 ) 2111 else: 2112 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2113 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2114 ) 2115 return None 2116 2117 def _build_incremental_cursor( 2118 self, 2119 model: DeclarativeStreamModel, 2120 stream_slicer: Optional[PartitionRouter], 2121 config: Config, 2122 ) -> Optional[StreamSlicer]: 2123 if model.incremental_sync and stream_slicer: 2124 if model.retriever.type == "AsyncRetriever": 2125 stream_name = model.name or "" 2126 stream_namespace = None 2127 stream_state = self._connector_state_manager.get_stream_state( 2128 stream_name, stream_namespace 2129 ) 2130 state_transformations = ( 2131 [ 2132 self._create_component_from_model( 2133 state_migration, config, declarative_stream=model 2134 ) 2135 for state_migration in model.state_migrations 2136 ] 2137 if model.state_migrations 2138 else [] 2139 ) 2140 2141 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2142 state_manager=self._connector_state_manager, 2143 model_type=DatetimeBasedCursorModel, 2144 component_definition=model.incremental_sync.__dict__, 2145 stream_name=stream_name, 2146 stream_namespace=stream_namespace, 2147 config=config or {}, 2148 stream_state=stream_state, 2149 stream_state_migrations=state_transformations, 2150 partition_router=stream_slicer, 2151 ) 2152 2153 incremental_sync_model = model.incremental_sync 2154 cursor_component = self._create_component_from_model( 2155 model=incremental_sync_model, config=config 2156 ) 2157 is_global_cursor = ( 2158 hasattr(incremental_sync_model, "global_substream_cursor") 2159 and incremental_sync_model.global_substream_cursor 2160 ) 2161 2162 if is_global_cursor: 2163 return GlobalSubstreamCursor( 2164 stream_cursor=cursor_component, partition_router=stream_slicer 2165 ) 2166 return PerPartitionWithGlobalCursor( 2167 cursor_factory=CursorFactory( 2168 lambda: self._create_component_from_model( 2169 model=incremental_sync_model, config=config 2170 ), 2171 ), 2172 partition_router=stream_slicer, 2173 stream_cursor=cursor_component, 2174 ) 2175 elif model.incremental_sync: 2176 if model.retriever.type == "AsyncRetriever": 2177 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2178 model_type=DatetimeBasedCursorModel, 2179 component_definition=model.incremental_sync.__dict__, 2180 stream_name=model.name or "", 2181 stream_namespace=None, 2182 config=config or {}, 2183 stream_state_migrations=model.state_migrations, 2184 ) 2185 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2186 return None 2187 2188 def _build_resumable_cursor( 2189 self, 2190 model: Union[ 2191 AsyncRetrieverModel, 2192 CustomRetrieverModel, 2193 SimpleRetrieverModel, 2194 ], 2195 stream_slicer: Optional[PartitionRouter], 2196 ) -> Optional[StreamSlicer]: 2197 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2198 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2199 return ResumableFullRefreshCursor(parameters={}) 2200 elif stream_slicer: 2201 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2202 return PerPartitionCursor( 2203 cursor_factory=CursorFactory( 2204 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2205 ), 2206 partition_router=stream_slicer, 2207 ) 2208 return None 2209 2210 def _merge_stream_slicers( 2211 self, model: DeclarativeStreamModel, config: Config 2212 ) -> Optional[StreamSlicer]: 2213 retriever_model = model.retriever 2214 2215 stream_slicer = self._build_stream_slicer_from_partition_router( 2216 retriever_model, config, stream_name=model.name 2217 ) 2218 2219 if retriever_model.type == "AsyncRetriever": 2220 is_not_datetime_cursor = ( 2221 model.incremental_sync.type != "DatetimeBasedCursor" 2222 if model.incremental_sync 2223 else None 2224 ) 2225 is_partition_router = ( 2226 
bool(retriever_model.partition_router) if model.incremental_sync else None 2227 ) 2228 2229 if is_not_datetime_cursor: 2230 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2231 # support or unordered slices (for example, when we trigger reports for January and February, the report 2232 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2233 # implementation available in the CDK, we can enable more cursors here. 2234 raise ValueError( 2235 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2236 ) 2237 2238 if is_partition_router and not stream_slicer: 2239 # Note that this development is also done in parallel to the per partition development which once merged 2240 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2241 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2242 2243 if model.incremental_sync: 2244 return self._build_incremental_cursor(model, stream_slicer, config) 2245 2246 return ( 2247 stream_slicer 2248 if self._disable_resumable_full_refresh 2249 else self._build_resumable_cursor(retriever_model, stream_slicer) 2250 ) 2251 2252 def create_default_error_handler( 2253 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2254 ) -> DefaultErrorHandler: 2255 backoff_strategies = [] 2256 if model.backoff_strategies: 2257 for backoff_strategy_model in model.backoff_strategies: 2258 backoff_strategies.append( 2259 self._create_component_from_model(model=backoff_strategy_model, config=config) 2260 ) 2261 2262 response_filters = [] 2263 if model.response_filters: 2264 for response_filter_model in model.response_filters: 2265 response_filters.append( 2266 self._create_component_from_model(model=response_filter_model, config=config) 2267 ) 2268 response_filters.append( 2269 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2270 ) 2271 2272 return DefaultErrorHandler( 2273 backoff_strategies=backoff_strategies, 2274 max_retries=model.max_retries, 2275 response_filters=response_filters, 2276 config=config, 2277 parameters=model.parameters or {}, 2278 ) 2279 2280 def create_default_paginator( 2281 self, 2282 model: DefaultPaginatorModel, 2283 config: Config, 2284 *, 2285 url_base: str, 2286 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2287 decoder: Optional[Decoder] = None, 2288 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2289 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2290 if decoder: 2291 if self._is_supported_decoder_for_pagination(decoder): 2292 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2293 else: 2294 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2295 else: 2296 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2297 page_size_option = ( 2298 self._create_component_from_model(model=model.page_size_option, config=config) 2299 if model.page_size_option 2300 else None 2301 ) 2302 page_token_option = ( 2303 self._create_component_from_model(model=model.page_token_option, config=config) 2304 if model.page_token_option 2305 else None 2306 ) 2307 pagination_strategy = self._create_component_from_model( 2308 model=model.pagination_strategy, 2309 config=config, 2310 decoder=decoder_to_use, 2311 extractor_model=extractor_model, 2312 ) 2313 if cursor_used_for_stop_condition: 2314 
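# Hedged sketch of what the wrapping below achieves (simplified pseudocode, not the CDK
# implementation; all names are placeholders): the delegate pagination strategy stops producing
# page tokens once the cursor-derived stop condition is met for the last record seen, e.g.
#
#     def next_page_token(response, last_record):          # illustrative helper, not a real API
#         if last_record is not None and stop_condition_is_met(last_record):
#             return None                                   # no further pages are requested
#         return delegate.next_page_token(response, last_record)
#
# This keeps data-feed style streams from paginating past records that were already synced.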
pagination_strategy = StopConditionPaginationStrategyDecorator( 2315 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2316 ) 2317 paginator = DefaultPaginator( 2318 decoder=decoder_to_use, 2319 page_size_option=page_size_option, 2320 page_token_option=page_token_option, 2321 pagination_strategy=pagination_strategy, 2322 url_base=url_base, 2323 config=config, 2324 parameters=model.parameters or {}, 2325 ) 2326 if self._limit_pages_fetched_per_slice: 2327 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2328 return paginator 2329 2330 def create_dpath_extractor( 2331 self, 2332 model: DpathExtractorModel, 2333 config: Config, 2334 decoder: Optional[Decoder] = None, 2335 **kwargs: Any, 2336 ) -> DpathExtractor: 2337 if decoder: 2338 decoder_to_use = decoder 2339 else: 2340 decoder_to_use = JsonDecoder(parameters={}) 2341 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2342 return DpathExtractor( 2343 decoder=decoder_to_use, 2344 field_path=model_field_path, 2345 config=config, 2346 parameters=model.parameters or {}, 2347 ) 2348 2349 @staticmethod 2350 def create_response_to_file_extractor( 2351 model: ResponseToFileExtractorModel, 2352 **kwargs: Any, 2353 ) -> ResponseToFileExtractor: 2354 return ResponseToFileExtractor(parameters=model.parameters or {}) 2355 2356 @staticmethod 2357 def create_exponential_backoff_strategy( 2358 model: ExponentialBackoffStrategyModel, config: Config 2359 ) -> ExponentialBackoffStrategy: 2360 return ExponentialBackoffStrategy( 2361 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2362 ) 2363 2364 @staticmethod 2365 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2366 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2367 2368 def create_http_requester( 2369 self, 2370 model: HttpRequesterModel, 2371 config: Config, 2372 decoder: Decoder = JsonDecoder(parameters={}), 2373 query_properties_key: Optional[str] = None, 2374 use_cache: Optional[bool] = None, 2375 *, 2376 name: str, 2377 ) -> HttpRequester: 2378 authenticator = ( 2379 self._create_component_from_model( 2380 model=model.authenticator, 2381 config=config, 2382 url_base=model.url or model.url_base, 2383 name=name, 2384 decoder=decoder, 2385 ) 2386 if model.authenticator 2387 else None 2388 ) 2389 error_handler = ( 2390 self._create_component_from_model(model=model.error_handler, config=config) 2391 if model.error_handler 2392 else DefaultErrorHandler( 2393 backoff_strategies=[], 2394 response_filters=[], 2395 config=config, 2396 parameters=model.parameters or {}, 2397 ) 2398 ) 2399 2400 api_budget = self._api_budget 2401 2402 # Removes QueryProperties components from the interpolated mappings because it has been designed 2403 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2404 # instead of through jinja interpolation 2405 request_parameters: Optional[Union[str, Mapping[str, str]]] 2406 if isinstance(model.request_parameters, Mapping): 2407 request_parameters = self._remove_query_properties(model.request_parameters) 2408 else: 2409 request_parameters = model.request_parameters 2410 2411 request_options_provider = InterpolatedRequestOptionsProvider( 2412 request_body=model.request_body, 2413 request_body_data=model.request_body_data, 2414 request_body_json=model.request_body_json, 2415 request_headers=model.request_headers, 2416 
request_parameters=request_parameters, 2417 query_properties_key=query_properties_key, 2418 config=config, 2419 parameters=model.parameters or {}, 2420 ) 2421 2422 assert model.use_cache is not None # for mypy 2423 assert model.http_method is not None # for mypy 2424 2425 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2426 2427 return HttpRequester( 2428 name=name, 2429 url=model.url, 2430 url_base=model.url_base, 2431 path=model.path, 2432 authenticator=authenticator, 2433 error_handler=error_handler, 2434 api_budget=api_budget, 2435 http_method=HttpMethod[model.http_method.value], 2436 request_options_provider=request_options_provider, 2437 config=config, 2438 disable_retries=self._disable_retries, 2439 parameters=model.parameters or {}, 2440 message_repository=self._message_repository, 2441 use_cache=should_use_cache, 2442 decoder=decoder, 2443 stream_response=decoder.is_stream_response() if decoder else False, 2444 ) 2445 2446 @staticmethod 2447 def create_http_response_filter( 2448 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2449 ) -> HttpResponseFilter: 2450 if model.action: 2451 action = ResponseAction(model.action.value) 2452 else: 2453 action = None 2454 2455 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2456 2457 http_codes = ( 2458 set(model.http_codes) if model.http_codes else set() 2459 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2460 2461 return HttpResponseFilter( 2462 action=action, 2463 failure_type=failure_type, 2464 error_message=model.error_message or "", 2465 error_message_contains=model.error_message_contains or "", 2466 http_codes=http_codes, 2467 predicate=model.predicate or "", 2468 config=config, 2469 parameters=model.parameters or {}, 2470 ) 2471 2472 @staticmethod 2473 def create_inline_schema_loader( 2474 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2475 ) -> InlineSchemaLoader: 2476 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2477 2478 def create_complex_field_type( 2479 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2480 ) -> ComplexFieldType: 2481 items = ( 2482 self._create_component_from_model(model=model.items, config=config) 2483 if isinstance(model.items, ComplexFieldTypeModel) 2484 else model.items 2485 ) 2486 2487 return ComplexFieldType(field_type=model.field_type, items=items) 2488 2489 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2490 target_type = ( 2491 self._create_component_from_model(model=model.target_type, config=config) 2492 if isinstance(model.target_type, ComplexFieldTypeModel) 2493 else model.target_type 2494 ) 2495 2496 return TypesMap( 2497 target_type=target_type, 2498 current_type=model.current_type, 2499 condition=model.condition if model.condition is not None else "True", 2500 ) 2501 2502 def create_schema_type_identifier( 2503 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2504 ) -> SchemaTypeIdentifier: 2505 types_mapping = [] 2506 if model.types_mapping: 2507 types_mapping.extend( 2508 [ 2509 self._create_component_from_model(types_map, config=config) 2510 for types_map in model.types_mapping 2511 ] 2512 ) 2513 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2514 [x for x in model.schema_pointer] if model.schema_pointer else [] 2515 ) 2516 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2517 
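# Hedged example (hypothetical manifest fragment as a Python mapping) of the pointers assembled in
# this method, for a schema endpoint that lists its fields under "fields" with "name" and "type" keys:
#
#     {
#         "type": "SchemaTypeIdentifier",
#         "schema_pointer": ["fields"],
#         "key_pointer": ["name"],
#         "type_pointer": ["type"],
#         "types_mapping": [{"type": "TypesMap", "target_type": "string", "current_type": "text"}],
#     }
#
# The endpoint shape and field names are placeholders, not taken from a specific connector.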
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2518 [x for x in model.type_pointer] if model.type_pointer else None 2519 ) 2520 2521 return SchemaTypeIdentifier( 2522 schema_pointer=model_schema_pointer, 2523 key_pointer=model_key_pointer, 2524 type_pointer=model_type_pointer, 2525 types_mapping=types_mapping, 2526 parameters=model.parameters or {}, 2527 ) 2528 2529 def create_dynamic_schema_loader( 2530 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2531 ) -> DynamicSchemaLoader: 2532 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2533 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2534 2535 schema_transformations = [] 2536 if model.schema_transformations: 2537 for transformation_model in model.schema_transformations: 2538 schema_transformations.append( 2539 self._create_component_from_model(model=transformation_model, config=config) 2540 ) 2541 name = "dynamic_properties" 2542 retriever = self._create_component_from_model( 2543 model=model.retriever, 2544 config=config, 2545 name=name, 2546 primary_key=None, 2547 stream_slicer=combined_slicers, 2548 transformations=[], 2549 use_cache=True, 2550 log_formatter=( 2551 lambda response: format_http_message( 2552 response, 2553 f"Schema loader '{name}' request", 2554 f"Request performed in order to extract schema.", 2555 name, 2556 is_auxiliary=True, 2557 ) 2558 ), 2559 ) 2560 schema_type_identifier = self._create_component_from_model( 2561 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2562 ) 2563 schema_filter = ( 2564 self._create_component_from_model( 2565 model.schema_filter, config=config, parameters=model.parameters or {} 2566 ) 2567 if model.schema_filter is not None 2568 else None 2569 ) 2570 2571 return DynamicSchemaLoader( 2572 retriever=retriever, 2573 config=config, 2574 schema_transformations=schema_transformations, 2575 schema_filter=schema_filter, 2576 schema_type_identifier=schema_type_identifier, 2577 parameters=model.parameters or {}, 2578 ) 2579 2580 @staticmethod 2581 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2582 return JsonDecoder(parameters={}) 2583 2584 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2585 return CompositeRawDecoder( 2586 parser=ModelToComponentFactory._get_parser(model, config), 2587 stream_response=False if self._emit_connector_builder_messages else True, 2588 ) 2589 2590 def create_jsonl_decoder( 2591 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2592 ) -> Decoder: 2593 return CompositeRawDecoder( 2594 parser=ModelToComponentFactory._get_parser(model, config), 2595 stream_response=False if self._emit_connector_builder_messages else True, 2596 ) 2597 2598 def create_gzip_decoder( 2599 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2600 ) -> Decoder: 2601 _compressed_response_types = { 2602 "gzip", 2603 "x-gzip", 2604 "gzip, deflate", 2605 "x-gzip, deflate", 2606 "application/zip", 2607 "application/gzip", 2608 "application/x-gzip", 2609 "application/x-zip-compressed", 2610 } 2611 2612 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2613 2614 if self._emit_connector_builder_messages: 2615 # This is very surprising but if the response is not streamed, 2616 # CompositeRawDecoder calls response.content and the requests library 
actually uncompress the data as opposed to response.raw, 2617 # which uses urllib3 directly and does not uncompress the data. 2618 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2619 2620 return CompositeRawDecoder.by_headers( 2621 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2622 stream_response=True, 2623 fallback_parser=gzip_parser.inner_parser, 2624 ) 2625 2626 @staticmethod 2627 def create_incrementing_count_cursor( 2628 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2629 ) -> DatetimeBasedCursor: 2630 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2631 # we still parse models into components. The issue is that there's no runtime implementation of a 2632 # IncrementingCountCursor. 2633 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2634 return DatetimeBasedCursor( 2635 cursor_field=model.cursor_field, 2636 datetime_format="%Y-%m-%d", 2637 start_datetime="2024-12-12", 2638 config=config, 2639 parameters={}, 2640 ) 2641 2642 @staticmethod 2643 def create_iterable_decoder( 2644 model: IterableDecoderModel, config: Config, **kwargs: Any 2645 ) -> IterableDecoder: 2646 return IterableDecoder(parameters={}) 2647 2648 @staticmethod 2649 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2650 return XmlDecoder(parameters={}) 2651 2652 def create_zipfile_decoder( 2653 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2654 ) -> ZipfileDecoder: 2655 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2656 2657 @staticmethod 2658 def _get_parser(model: BaseModel, config: Config) -> Parser: 2659 if isinstance(model, JsonDecoderModel): 2660 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2661 return JsonParser() 2662 elif isinstance(model, JsonlDecoderModel): 2663 return JsonLineParser() 2664 elif isinstance(model, CsvDecoderModel): 2665 return CsvParser( 2666 encoding=model.encoding, 2667 delimiter=model.delimiter, 2668 set_values_to_none=model.set_values_to_none, 2669 ) 2670 elif isinstance(model, GzipDecoderModel): 2671 return GzipParser( 2672 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2673 ) 2674 elif isinstance( 2675 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2676 ): 2677 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2678 2679 raise ValueError(f"Unknown decoder type {model}") 2680 2681 @staticmethod 2682 def create_json_file_schema_loader( 2683 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2684 ) -> JsonFileSchemaLoader: 2685 return JsonFileSchemaLoader( 2686 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2687 ) 2688 2689 @staticmethod 2690 def create_jwt_authenticator( 2691 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2692 ) -> JwtAuthenticator: 2693 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2694 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2695 return JwtAuthenticator( 2696 config=config, 2697 parameters=model.parameters or {}, 2698 algorithm=JwtAlgorithm(model.algorithm.value), 2699 secret_key=model.secret_key, 2700 
base64_encode_secret_key=model.base64_encode_secret_key, 2701 token_duration=model.token_duration, 2702 header_prefix=model.header_prefix, 2703 kid=jwt_headers.kid, 2704 typ=jwt_headers.typ, 2705 cty=jwt_headers.cty, 2706 iss=jwt_payload.iss, 2707 sub=jwt_payload.sub, 2708 aud=jwt_payload.aud, 2709 additional_jwt_headers=model.additional_jwt_headers, 2710 additional_jwt_payload=model.additional_jwt_payload, 2711 ) 2712 2713 def create_list_partition_router( 2714 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2715 ) -> ListPartitionRouter: 2716 request_option = ( 2717 self._create_component_from_model(model.request_option, config) 2718 if model.request_option 2719 else None 2720 ) 2721 return ListPartitionRouter( 2722 cursor_field=model.cursor_field, 2723 request_option=request_option, 2724 values=model.values, 2725 config=config, 2726 parameters=model.parameters or {}, 2727 ) 2728 2729 @staticmethod 2730 def create_min_max_datetime( 2731 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2732 ) -> MinMaxDatetime: 2733 return MinMaxDatetime( 2734 datetime=model.datetime, 2735 datetime_format=model.datetime_format or "", 2736 max_datetime=model.max_datetime or "", 2737 min_datetime=model.min_datetime or "", 2738 parameters=model.parameters or {}, 2739 ) 2740 2741 @staticmethod 2742 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2743 return NoAuth(parameters=model.parameters or {}) 2744 2745 @staticmethod 2746 def create_no_pagination( 2747 model: NoPaginationModel, config: Config, **kwargs: Any 2748 ) -> NoPagination: 2749 return NoPagination(parameters={}) 2750 2751 def create_oauth_authenticator( 2752 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2753 ) -> DeclarativeOauth2Authenticator: 2754 profile_assertion = ( 2755 self._create_component_from_model(model.profile_assertion, config=config) 2756 if model.profile_assertion 2757 else None 2758 ) 2759 2760 if model.refresh_token_updater: 2761 # ignore type error because fixing it would have a lot of dependencies, revisit later 2762 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2763 config, 2764 InterpolatedString.create( 2765 model.token_refresh_endpoint, # type: ignore 2766 parameters=model.parameters or {}, 2767 ).eval(config), 2768 access_token_name=InterpolatedString.create( 2769 model.access_token_name or "access_token", parameters=model.parameters or {} 2770 ).eval(config), 2771 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2772 expires_in_name=InterpolatedString.create( 2773 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2774 ).eval(config), 2775 client_id_name=InterpolatedString.create( 2776 model.client_id_name or "client_id", parameters=model.parameters or {} 2777 ).eval(config), 2778 client_id=InterpolatedString.create( 2779 model.client_id, parameters=model.parameters or {} 2780 ).eval(config) 2781 if model.client_id 2782 else model.client_id, 2783 client_secret_name=InterpolatedString.create( 2784 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2785 ).eval(config), 2786 client_secret=InterpolatedString.create( 2787 model.client_secret, parameters=model.parameters or {} 2788 ).eval(config) 2789 if model.client_secret 2790 else model.client_secret, 2791 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2792 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2793 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2794 grant_type_name=InterpolatedString.create( 2795 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2796 ).eval(config), 2797 grant_type=InterpolatedString.create( 2798 model.grant_type or "refresh_token", parameters=model.parameters or {} 2799 ).eval(config), 2800 refresh_request_body=InterpolatedMapping( 2801 model.refresh_request_body or {}, parameters=model.parameters or {} 2802 ).eval(config), 2803 refresh_request_headers=InterpolatedMapping( 2804 model.refresh_request_headers or {}, parameters=model.parameters or {} 2805 ).eval(config), 2806 scopes=model.scopes, 2807 token_expiry_date_format=model.token_expiry_date_format, 2808 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2809 message_repository=self._message_repository, 2810 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2811 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2812 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2813 ) 2814 # ignore type error because fixing it would have a lot of dependencies, revisit later 2815 return DeclarativeOauth2Authenticator( # type: ignore 2816 access_token_name=model.access_token_name or "access_token", 2817 access_token_value=model.access_token_value, 2818 client_id_name=model.client_id_name or "client_id", 2819 client_id=model.client_id, 2820 client_secret_name=model.client_secret_name or "client_secret", 2821 client_secret=model.client_secret, 2822 expires_in_name=model.expires_in_name or "expires_in", 2823 grant_type_name=model.grant_type_name or "grant_type", 2824 grant_type=model.grant_type or "refresh_token", 2825 refresh_request_body=model.refresh_request_body, 2826 refresh_request_headers=model.refresh_request_headers, 2827 refresh_token_name=model.refresh_token_name or "refresh_token", 2828 refresh_token=model.refresh_token, 2829 scopes=model.scopes, 2830 token_expiry_date=model.token_expiry_date, 2831 token_expiry_date_format=model.token_expiry_date_format, 2832 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2833 token_refresh_endpoint=model.token_refresh_endpoint, 2834 config=config, 2835 parameters=model.parameters or {}, 2836 message_repository=self._message_repository, 2837 profile_assertion=profile_assertion, 2838 use_profile_assertion=model.use_profile_assertion, 2839 ) 2840 2841 def create_offset_increment( 2842 self, 2843 model: OffsetIncrementModel, 2844 config: Config, 2845 decoder: Decoder, 2846 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2847 **kwargs: Any, 2848 ) -> OffsetIncrement: 2849 if isinstance(decoder, PaginationDecoderDecorator): 2850 inner_decoder = decoder.decoder 2851 else: 2852 inner_decoder = decoder 2853 decoder = PaginationDecoderDecorator(decoder=decoder) 2854 2855 if self._is_supported_decoder_for_pagination(inner_decoder): 2856 decoder_to_use = decoder 2857 else: 2858 raise ValueError( 2859 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2860 ) 2861 2862 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2863 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2864 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2865 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2866 # When we have more time to investigate we can look into reusing the same component. 2867 extractor = ( 2868 self._create_component_from_model( 2869 model=extractor_model, config=config, decoder=decoder_to_use 2870 ) 2871 if extractor_model 2872 else None 2873 ) 2874 2875 return OffsetIncrement( 2876 page_size=model.page_size, 2877 config=config, 2878 decoder=decoder_to_use, 2879 extractor=extractor, 2880 inject_on_first_request=model.inject_on_first_request or False, 2881 parameters=model.parameters or {}, 2882 ) 2883 2884 @staticmethod 2885 def create_page_increment( 2886 model: PageIncrementModel, config: Config, **kwargs: Any 2887 ) -> PageIncrement: 2888 return PageIncrement( 2889 page_size=model.page_size, 2890 config=config, 2891 start_from_page=model.start_from_page or 0, 2892 inject_on_first_request=model.inject_on_first_request or False, 2893 parameters=model.parameters or {}, 2894 ) 2895 2896 def create_parent_stream_config( 2897 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2898 ) -> ParentStreamConfig: 2899 declarative_stream = self._create_component_from_model( 2900 model.stream, config=config, **kwargs 2901 ) 2902 request_option = ( 2903 self._create_component_from_model(model.request_option, config=config) 2904 if model.request_option 2905 else None 2906 ) 2907 2908 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2909 raise ValueError( 2910 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2911 ) 2912 2913 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2914 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2915 ) 2916 2917 return ParentStreamConfig( 2918 parent_key=model.parent_key, 2919 request_option=request_option, 2920 stream=declarative_stream, 2921 partition_field=model.partition_field, 2922 config=config, 2923 incremental_dependency=model.incremental_dependency or False, 2924 parameters=model.parameters or {}, 2925 extra_fields=model.extra_fields, 2926 lazy_read_pointer=model_lazy_read_pointer, 2927 ) 2928 2929 def create_properties_from_endpoint( 2930 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2931 ) -> PropertiesFromEndpoint: 2932 retriever = self._create_component_from_model( 2933 model=model.retriever, 2934 config=config, 2935 name="dynamic_properties", 2936 primary_key=None, 2937 stream_slicer=None, 2938 transformations=[], 2939 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2940 ) 2941 return PropertiesFromEndpoint( 2942 property_field_path=model.property_field_path, 2943 retriever=retriever, 2944 config=config, 2945 parameters=model.parameters or {}, 2946 ) 2947 2948 def create_property_chunking( 2949 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2950 ) -> PropertyChunking: 2951 record_merge_strategy = ( 2952 self._create_component_from_model( 2953 model=model.record_merge_strategy, config=config, **kwargs 2954 ) 2955 if model.record_merge_strategy 2956 else None 2957 ) 2958 2959 property_limit_type: PropertyLimitType 2960 match model.property_limit_type: 2961 case PropertyLimitTypeModel.property_count: 2962 property_limit_type = PropertyLimitType.property_count 2963 case PropertyLimitTypeModel.characters: 2964 property_limit_type = PropertyLimitType.characters 2965 case _: 2966 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2967 2968 return PropertyChunking( 2969 property_limit_type=property_limit_type, 2970 property_limit=model.property_limit, 2971 record_merge_strategy=record_merge_strategy, 2972 config=config, 2973 parameters=model.parameters or {}, 2974 ) 2975 2976 def create_query_properties( 2977 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2978 ) -> QueryProperties: 2979 if isinstance(model.property_list, list): 2980 property_list = model.property_list 2981 else: 2982 property_list = self._create_component_from_model( 2983 model=model.property_list, config=config, **kwargs 2984 ) 2985 2986 property_chunking = ( 2987 self._create_component_from_model( 2988 model=model.property_chunking, config=config, **kwargs 2989 ) 2990 if model.property_chunking 2991 else None 2992 ) 2993 2994 return QueryProperties( 2995 property_list=property_list, 2996 always_include_properties=model.always_include_properties, 2997 property_chunking=property_chunking, 2998 config=config, 2999 parameters=model.parameters or {}, 3000 ) 3001 3002 @staticmethod 3003 def create_record_filter( 3004 model: RecordFilterModel, config: Config, **kwargs: Any 3005 ) -> RecordFilter: 3006 return RecordFilter( 3007 condition=model.condition or "", config=config, parameters=model.parameters or {} 3008 ) 3009 3010 @staticmethod 3011 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3012 return RequestPath(parameters={}) 3013 3014 @staticmethod 3015 def
create_request_option( 3016 model: RequestOptionModel, config: Config, **kwargs: Any 3017 ) -> RequestOption: 3018 inject_into = RequestOptionType(model.inject_into.value) 3019 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3020 [ 3021 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3022 for segment in model.field_path 3023 ] 3024 if model.field_path 3025 else None 3026 ) 3027 field_name = ( 3028 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3029 if model.field_name 3030 else None 3031 ) 3032 return RequestOption( 3033 field_name=field_name, 3034 field_path=field_path, 3035 inject_into=inject_into, 3036 parameters=kwargs.get("parameters", {}), 3037 ) 3038 3039 def create_record_selector( 3040 self, 3041 model: RecordSelectorModel, 3042 config: Config, 3043 *, 3044 name: str, 3045 transformations: List[RecordTransformation] | None = None, 3046 decoder: Decoder | None = None, 3047 client_side_incremental_sync: Dict[str, Any] | None = None, 3048 file_uploader: Optional[DefaultFileUploader] = None, 3049 **kwargs: Any, 3050 ) -> RecordSelector: 3051 extractor = self._create_component_from_model( 3052 model=model.extractor, decoder=decoder, config=config 3053 ) 3054 record_filter = ( 3055 self._create_component_from_model(model.record_filter, config=config) 3056 if model.record_filter 3057 else None 3058 ) 3059 3060 transform_before_filtering = ( 3061 False if model.transform_before_filtering is None else model.transform_before_filtering 3062 ) 3063 if client_side_incremental_sync: 3064 record_filter = ClientSideIncrementalRecordFilterDecorator( 3065 config=config, 3066 parameters=model.parameters, 3067 condition=model.record_filter.condition 3068 if (model.record_filter and hasattr(model.record_filter, "condition")) 3069 else None, 3070 **client_side_incremental_sync, 3071 ) 3072 transform_before_filtering = ( 3073 True 3074 if model.transform_before_filtering is None 3075 else model.transform_before_filtering 3076 ) 3077 3078 if model.schema_normalization is None: 3079 # default to no schema normalization if not set 3080 model.schema_normalization = SchemaNormalizationModel.None_ 3081 3082 schema_normalization = ( 3083 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3084 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3085 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3086 ) 3087 3088 return RecordSelector( 3089 extractor=extractor, 3090 name=name, 3091 config=config, 3092 record_filter=record_filter, 3093 transformations=transformations or [], 3094 file_uploader=file_uploader, 3095 schema_normalization=schema_normalization, 3096 parameters=model.parameters or {}, 3097 transform_before_filtering=transform_before_filtering, 3098 ) 3099 3100 @staticmethod 3101 def create_remove_fields( 3102 model: RemoveFieldsModel, config: Config, **kwargs: Any 3103 ) -> RemoveFields: 3104 return RemoveFields( 3105 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3106 ) 3107 3108 def create_selective_authenticator( 3109 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3110 ) -> DeclarativeAuthenticator: 3111 authenticators = { 3112 name: self._create_component_from_model(model=auth, config=config) 3113 for name, auth in model.authenticators.items() 3114 } 3115 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3116 return SelectiveAuthenticator( # type: ignore[abstract] 3117 config=config, 3118 authenticators=authenticators, 3119 authenticator_selection_path=model.authenticator_selection_path, 3120 **kwargs, 3121 ) 3122 3123 @staticmethod 3124 def create_legacy_session_token_authenticator( 3125 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3126 ) -> LegacySessionTokenAuthenticator: 3127 return LegacySessionTokenAuthenticator( 3128 api_url=url_base, 3129 header=model.header, 3130 login_url=model.login_url, 3131 password=model.password or "", 3132 session_token=model.session_token or "", 3133 session_token_response_key=model.session_token_response_key or "", 3134 username=model.username or "", 3135 validate_session_url=model.validate_session_url, 3136 config=config, 3137 parameters=model.parameters or {}, 3138 ) 3139 3140 def create_simple_retriever( 3141 self, 3142 model: SimpleRetrieverModel, 3143 config: Config, 3144 *, 3145 name: str, 3146 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3147 stream_slicer: Optional[StreamSlicer], 3148 request_options_provider: Optional[RequestOptionsProvider] = None, 3149 stop_condition_on_cursor: bool = False, 3150 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3151 transformations: List[RecordTransformation], 3152 file_uploader: Optional[DefaultFileUploader] = None, 3153 incremental_sync: Optional[ 3154 Union[ 3155 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3156 ] 3157 ] = None, 3158 use_cache: Optional[bool] = None, 3159 log_formatter: Optional[Callable[[Response], Any]] = None, 3160 **kwargs: Any, 3161 ) -> SimpleRetriever: 3162 def _get_url() -> str: 3163 """ 3164 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3165 This is needed because the URL is not set until the requester is created. 
3166 """ 3167 3168 _url: str = ( 3169 model.requester.url 3170 if hasattr(model.requester, "url") and model.requester.url is not None 3171 else requester.get_url() 3172 ) 3173 _url_base: str = ( 3174 model.requester.url_base 3175 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3176 else requester.get_url_base() 3177 ) 3178 3179 return _url or _url_base 3180 3181 decoder = ( 3182 self._create_component_from_model(model=model.decoder, config=config) 3183 if model.decoder 3184 else JsonDecoder(parameters={}) 3185 ) 3186 record_selector = self._create_component_from_model( 3187 model=model.record_selector, 3188 name=name, 3189 config=config, 3190 decoder=decoder, 3191 transformations=transformations, 3192 client_side_incremental_sync=client_side_incremental_sync, 3193 file_uploader=file_uploader, 3194 ) 3195 3196 query_properties: Optional[QueryProperties] = None 3197 query_properties_key: Optional[str] = None 3198 if self._query_properties_in_request_parameters(model.requester): 3199 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3200 # places instead of default to request_parameters which isn't clearly documented 3201 if ( 3202 hasattr(model.requester, "fetch_properties_from_endpoint") 3203 and model.requester.fetch_properties_from_endpoint 3204 ): 3205 raise ValueError( 3206 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3207 ) 3208 3209 query_properties_definitions = [] 3210 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3211 if isinstance(request_parameter, QueryPropertiesModel): 3212 query_properties_key = key 3213 query_properties_definitions.append(request_parameter) 3214 3215 if len(query_properties_definitions) > 1: 3216 raise ValueError( 3217 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3218 ) 3219 3220 if len(query_properties_definitions) == 1: 3221 query_properties = self._create_component_from_model( 3222 model=query_properties_definitions[0], config=config 3223 ) 3224 elif ( 3225 hasattr(model.requester, "fetch_properties_from_endpoint") 3226 and model.requester.fetch_properties_from_endpoint 3227 ): 3228 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3229 query_properties_definition = QueryPropertiesModel( 3230 type="QueryProperties", 3231 property_list=model.requester.fetch_properties_from_endpoint, 3232 always_include_properties=None, 3233 property_chunking=None, 3234 ) # type: ignore # $parameters has a default value 3235 3236 query_properties = self.create_query_properties( 3237 model=query_properties_definition, 3238 config=config, 3239 ) 3240 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3241 query_properties = self.create_query_properties( 3242 model=model.requester.query_properties, 3243 config=config, 3244 ) 3245 3246 requester = self._create_component_from_model( 3247 model=model.requester, 3248 decoder=decoder, 3249 name=name, 3250 query_properties_key=query_properties_key, 3251 use_cache=use_cache, 3252 config=config, 3253 ) 3254 3255 # Define cursor only if per partition or common incremental support is needed 3256 cursor = stream_slicer if 
isinstance(stream_slicer, DeclarativeCursor) else None 3257 3258 if ( 3259 not isinstance(stream_slicer, DatetimeBasedCursor) 3260 or type(stream_slicer) is not DatetimeBasedCursor 3261 ): 3262 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 3263 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3264 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the SimpleRetriever's 3265 # request_options_provider 3266 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3267 elif not request_options_provider: 3268 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3269 3270 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3271 if self._should_limit_slices_fetched(): 3272 stream_slicer = cast( 3273 StreamSlicer, 3274 StreamSlicerTestReadDecorator( 3275 wrapped_slicer=stream_slicer, 3276 maximum_number_of_slices=self._limit_slices_fetched or 5, 3277 ), 3278 ) 3279 3280 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3281 paginator = ( 3282 self._create_component_from_model( 3283 model=model.paginator, 3284 config=config, 3285 url_base=_get_url(), 3286 extractor_model=model.record_selector.extractor, 3287 decoder=decoder, 3288 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3289 ) 3290 if model.paginator 3291 else NoPagination(parameters={}) 3292 ) 3293 3294 ignore_stream_slicer_parameters_on_paginated_requests = ( 3295 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3296 ) 3297 3298 if ( 3299 model.partition_router 3300 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3301 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3302 and any( 3303 parent_stream_config.lazy_read_pointer 3304 for parent_stream_config in model.partition_router.parent_stream_configs 3305 ) 3306 ): 3307 if incremental_sync: 3308 if incremental_sync.type != "DatetimeBasedCursor": 3309 raise ValueError( 3310 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3311 ) 3312 3313 elif incremental_sync.step or incremental_sync.cursor_granularity: 3314 raise ValueError( 3315 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3316 ) 3317 3318 if model.decoder and model.decoder.type != "JsonDecoder": 3319 raise ValueError( 3320 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3321 ) 3322 3323 return LazySimpleRetriever( 3324 name=name, 3325 paginator=paginator, 3326 primary_key=primary_key, 3327 requester=requester, 3328 record_selector=record_selector, 3329 stream_slicer=stream_slicer, 3330 request_option_provider=request_options_provider, 3331 cursor=cursor, 3332 config=config, 3333 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3334 parameters=model.parameters or {}, 3335 ) 3336 3337 return SimpleRetriever( 3338 name=name, 3339 paginator=paginator, 3340 primary_key=primary_key, 3341 requester=requester, 3342 record_selector=record_selector, 3343 stream_slicer=stream_slicer, 3344 request_option_provider=request_options_provider, 3345 cursor=cursor, 3346 config=config, 3347 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3348 additional_query_properties=query_properties, 3349 log_formatter=self._get_log_formatter(log_formatter, name), 3350 parameters=model.parameters or {}, 3351 ) 3352 3353 def _get_log_formatter( 3354 self, log_formatter: Callable[[Response], Any] | None, name: str 3355 ) -> Callable[[Response], Any] | None: 3356 if self._should_limit_slices_fetched(): 3357 return ( 3358 ( 3359 lambda response: format_http_message( 3360 response, 3361 f"Stream '{name}' request", 3362 f"Request performed in order to extract records for stream '{name}'", 3363 name, 3364 ) 3365 ) 3366 if not log_formatter 3367 else log_formatter 3368 ) 3369 return None 3370 3371 def _should_limit_slices_fetched(self) -> bool: 3372 """ 3373 Returns True if the number of slices fetched should be limited, False otherwise. 3374 This is used to limit the number of slices fetched during tests. 3375 """ 3376 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3377 3378 @staticmethod 3379 def _query_properties_in_request_parameters( 3380 requester: Union[HttpRequesterModel, CustomRequesterModel], 3381 ) -> bool: 3382 if not hasattr(requester, "request_parameters"): 3383 return False 3384 request_parameters = requester.request_parameters 3385 if request_parameters and isinstance(request_parameters, Mapping): 3386 for request_parameter in request_parameters.values(): 3387 if isinstance(request_parameter, QueryPropertiesModel): 3388 return True 3389 return False 3390 3391 @staticmethod 3392 def _remove_query_properties( 3393 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3394 ) -> Mapping[str, str]: 3395 return { 3396 parameter_field: request_parameter 3397 for parameter_field, request_parameter in request_parameters.items() 3398 if not isinstance(request_parameter, QueryPropertiesModel) 3399 } 3400 3401 def create_state_delegating_stream( 3402 self, 3403 model: StateDelegatingStreamModel, 3404 config: Config, 3405 has_parent_state: Optional[bool] = None, 3406 **kwargs: Any, 3407 ) -> DeclarativeStream: 3408 if ( 3409 model.full_refresh_stream.name != model.name 3410 or model.name != model.incremental_stream.name 3411 ): 3412 raise ValueError( 3413 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3414 ) 3415 3416 stream_model = ( 3417 model.incremental_stream 3418 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3419 else model.full_refresh_stream 3420 ) 3421 3422 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3423 3424 def _create_async_job_status_mapping( 3425 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3426 ) -> Mapping[str, AsyncJobStatus]: 3427 api_status_to_cdk_status = {} 3428 for cdk_status, api_statuses in model.dict().items(): 3429 if cdk_status == "type": 3430 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3431 continue 3432 3433 for status in api_statuses: 3434 if status in api_status_to_cdk_status: 3435 raise ValueError( 3436 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3437 ) 3438 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3439 return api_status_to_cdk_status 3440 3441 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3442 match status: 3443 case "running": 3444 return AsyncJobStatus.RUNNING 3445 case "completed": 3446 return AsyncJobStatus.COMPLETED 3447 case "failed": 3448 return AsyncJobStatus.FAILED 3449 case "timeout": 3450 return AsyncJobStatus.TIMED_OUT 3451 case _: 3452 raise ValueError(f"Unsupported CDK status {status}") 3453 3454 def create_async_retriever( 3455 self, 3456 model: AsyncRetrieverModel, 3457 config: Config, 3458 *, 3459 name: str, 3460 primary_key: Optional[ 3461 Union[str, List[str], List[List[str]]] 3462 ], # this seems to be needed to match create_simple_retriever 3463 stream_slicer: Optional[StreamSlicer], 3464 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3465 transformations: List[RecordTransformation], 3466 **kwargs: Any, 3467 ) -> AsyncRetriever: 3468 def _get_download_retriever() -> SimpleRetriever: 3469 # We create a record selector for the download retriever 3470 # with no schema normalization and no transformations, neither record filter 3471 # as all this occurs in the record_selector of the AsyncRetriever 3472 record_selector = RecordSelector( 3473 extractor=download_extractor, 3474 name=name, 3475 record_filter=None, 3476 transformations=[], 3477 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3478 config=config, 3479 parameters={}, 3480 ) 3481 paginator = ( 3482 self._create_component_from_model( 3483 model=model.download_paginator, 3484 decoder=decoder, 3485 config=config, 3486 url_base="", 3487 ) 3488 if model.download_paginator 3489 else NoPagination(parameters={}) 3490 ) 3491 3492 return SimpleRetriever( 3493 requester=download_requester, 3494 record_selector=record_selector, 3495 primary_key=None, 3496 name=job_download_components_name, 3497 paginator=paginator, 3498 config=config, 3499 parameters={}, 3500 ) 3501 3502 def _get_job_timeout() -> datetime.timedelta: 3503 user_defined_timeout: Optional[int] = ( 3504 int( 3505 InterpolatedString.create( 3506 str(model.polling_job_timeout), 3507 parameters={}, 3508 ).eval(config) 3509 ) 3510 if model.polling_job_timeout 3511 else None 3512 ) 3513 3514 # check for user defined timeout during the test read or 15 minutes 3515 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3516 # default value for non-connector builder is 60 minutes. 
3517 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3518 3519 return ( 3520 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3521 ) 3522 3523 decoder = ( 3524 self._create_component_from_model(model=model.decoder, config=config) 3525 if model.decoder 3526 else JsonDecoder(parameters={}) 3527 ) 3528 record_selector = self._create_component_from_model( 3529 model=model.record_selector, 3530 config=config, 3531 decoder=decoder, 3532 name=name, 3533 transformations=transformations, 3534 client_side_incremental_sync=client_side_incremental_sync, 3535 ) 3536 3537 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3538 if self._should_limit_slices_fetched(): 3539 stream_slicer = cast( 3540 StreamSlicer, 3541 StreamSlicerTestReadDecorator( 3542 wrapped_slicer=stream_slicer, 3543 maximum_number_of_slices=self._limit_slices_fetched or 5, 3544 ), 3545 ) 3546 3547 creation_requester = self._create_component_from_model( 3548 model=model.creation_requester, 3549 decoder=decoder, 3550 config=config, 3551 name=f"job creation - {name}", 3552 ) 3553 polling_requester = self._create_component_from_model( 3554 model=model.polling_requester, 3555 decoder=decoder, 3556 config=config, 3557 name=f"job polling - {name}", 3558 ) 3559 job_download_components_name = f"job download - {name}" 3560 download_decoder = ( 3561 self._create_component_from_model(model=model.download_decoder, config=config) 3562 if model.download_decoder 3563 else JsonDecoder(parameters={}) 3564 ) 3565 download_extractor = ( 3566 self._create_component_from_model( 3567 model=model.download_extractor, 3568 config=config, 3569 decoder=download_decoder, 3570 parameters=model.parameters, 3571 ) 3572 if model.download_extractor 3573 else DpathExtractor( 3574 [], 3575 config=config, 3576 decoder=download_decoder, 3577 parameters=model.parameters or {}, 3578 ) 3579 ) 3580 download_requester = self._create_component_from_model( 3581 model=model.download_requester, 3582 decoder=download_decoder, 3583 config=config, 3584 name=job_download_components_name, 3585 ) 3586 download_retriever = _get_download_retriever() 3587 abort_requester = ( 3588 self._create_component_from_model( 3589 model=model.abort_requester, 3590 decoder=decoder, 3591 config=config, 3592 name=f"job abort - {name}", 3593 ) 3594 if model.abort_requester 3595 else None 3596 ) 3597 delete_requester = ( 3598 self._create_component_from_model( 3599 model=model.delete_requester, 3600 decoder=decoder, 3601 config=config, 3602 name=f"job delete - {name}", 3603 ) 3604 if model.delete_requester 3605 else None 3606 ) 3607 download_target_requester = ( 3608 self._create_component_from_model( 3609 model=model.download_target_requester, 3610 decoder=decoder, 3611 config=config, 3612 name=f"job extract_url - {name}", 3613 ) 3614 if model.download_target_requester 3615 else None 3616 ) 3617 status_extractor = self._create_component_from_model( 3618 model=model.status_extractor, decoder=decoder, config=config, name=name 3619 ) 3620 download_target_extractor = self._create_component_from_model( 3621 model=model.download_target_extractor, 3622 decoder=decoder, 3623 config=config, 3624 name=name, 3625 ) 3626 3627 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3628 creation_requester=creation_requester, 3629 polling_requester=polling_requester, 3630 download_retriever=download_retriever, 3631 download_target_requester=download_target_requester, 3632 abort_requester=abort_requester, 3633 
delete_requester=delete_requester, 3634 status_extractor=status_extractor, 3635 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3636 download_target_extractor=download_target_extractor, 3637 job_timeout=_get_job_timeout(), 3638 ) 3639 3640 async_job_partition_router = AsyncJobPartitionRouter( 3641 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3642 job_repository, 3643 stream_slices, 3644 self._job_tracker, 3645 self._message_repository, 3646 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3647 has_bulk_parent=False, 3648 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3649 # `None` == default retry is set to 3 attempts, under the hood. 3650 job_max_retry=1 if self._emit_connector_builder_messages else None, 3651 ), 3652 stream_slicer=stream_slicer, 3653 config=config, 3654 parameters=model.parameters or {}, 3655 ) 3656 3657 return AsyncRetriever( 3658 record_selector=record_selector, 3659 stream_slicer=async_job_partition_router, 3660 config=config, 3661 parameters=model.parameters or {}, 3662 ) 3663 3664 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3665 config_migrations = [ 3666 self._create_component_from_model(migration, config) 3667 for migration in ( 3668 model.config_normalization_rules.config_migrations 3669 if ( 3670 model.config_normalization_rules 3671 and model.config_normalization_rules.config_migrations 3672 ) 3673 else [] 3674 ) 3675 ] 3676 config_transformations = [ 3677 self._create_component_from_model(transformation, config) 3678 for transformation in ( 3679 model.config_normalization_rules.transformations 3680 if ( 3681 model.config_normalization_rules 3682 and model.config_normalization_rules.transformations 3683 ) 3684 else [] 3685 ) 3686 ] 3687 config_validations = [ 3688 self._create_component_from_model(validation, config) 3689 for validation in ( 3690 model.config_normalization_rules.validations 3691 if ( 3692 model.config_normalization_rules 3693 and model.config_normalization_rules.validations 3694 ) 3695 else [] 3696 ) 3697 ] 3698 3699 return Spec( 3700 connection_specification=model.connection_specification, 3701 documentation_url=model.documentation_url, 3702 advanced_auth=model.advanced_auth, 3703 parameters={}, 3704 config_migrations=config_migrations, 3705 config_transformations=config_transformations, 3706 config_validations=config_validations, 3707 ) 3708 3709 def create_substream_partition_router( 3710 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3711 ) -> SubstreamPartitionRouter: 3712 parent_stream_configs = [] 3713 if model.parent_stream_configs: 3714 parent_stream_configs.extend( 3715 [ 3716 self._create_message_repository_substream_wrapper( 3717 model=parent_stream_config, config=config, **kwargs 3718 ) 3719 for parent_stream_config in model.parent_stream_configs 3720 ] 3721 ) 3722 3723 return SubstreamPartitionRouter( 3724 parent_stream_configs=parent_stream_configs, 3725 parameters=model.parameters or {}, 3726 config=config, 3727 ) 3728 3729 def _create_message_repository_substream_wrapper( 3730 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3731 ) -> Any: 3732 substream_factory = ModelToComponentFactory( 3733 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3734 limit_slices_fetched=self._limit_slices_fetched, 3735 emit_connector_builder_messages=self._emit_connector_builder_messages, 3736
disable_retries=self._disable_retries, 3737 disable_cache=self._disable_cache, 3738 message_repository=LogAppenderMessageRepositoryDecorator( 3739 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3740 self._message_repository, 3741 self._evaluate_log_level(self._emit_connector_builder_messages), 3742 ), 3743 ) 3744 3745 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3746 has_parent_state = bool( 3747 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3748 if model.incremental_dependency 3749 else False 3750 ) 3751 return substream_factory._create_component_from_model( 3752 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3753 ) 3754 3755 @staticmethod 3756 def create_wait_time_from_header( 3757 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3758 ) -> WaitTimeFromHeaderBackoffStrategy: 3759 return WaitTimeFromHeaderBackoffStrategy( 3760 header=model.header, 3761 parameters=model.parameters or {}, 3762 config=config, 3763 regex=model.regex, 3764 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3765 if model.max_waiting_time_in_seconds is not None 3766 else None, 3767 ) 3768 3769 @staticmethod 3770 def create_wait_until_time_from_header( 3771 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3772 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3773 return WaitUntilTimeFromHeaderBackoffStrategy( 3774 header=model.header, 3775 parameters=model.parameters or {}, 3776 config=config, 3777 min_wait=model.min_wait, 3778 regex=model.regex, 3779 ) 3780 3781 def get_message_repository(self) -> MessageRepository: 3782 return self._message_repository 3783 3784 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3785 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3786 3787 @staticmethod 3788 def create_components_mapping_definition( 3789 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3790 ) -> ComponentMappingDefinition: 3791 interpolated_value = InterpolatedString.create( 3792 model.value, parameters=model.parameters or {} 3793 ) 3794 field_path = [ 3795 InterpolatedString.create(path, parameters=model.parameters or {}) 3796 for path in model.field_path 3797 ] 3798 return ComponentMappingDefinition( 3799 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3800 value=interpolated_value, 3801 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3802 create_or_update=model.create_or_update, 3803 parameters=model.parameters or {}, 3804 ) 3805 3806 def create_http_components_resolver( 3807 self, model: HttpComponentsResolverModel, config: Config 3808 ) -> Any: 3809 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3810 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3811 3812 retriever = self._create_component_from_model( 3813 model=model.retriever, 3814 config=config, 3815 name="", 3816 primary_key=None, 3817 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3818 transformations=[], 3819 ) 3820 3821 components_mapping = [ 3822 self._create_component_from_model( 3823 model=components_mapping_definition_model, 3824 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3825 components_mapping_definition_model.value_type 3826 ), 3827 config=config, 3828 ) 3829 for 
components_mapping_definition_model in model.components_mapping 3830 ] 3831 3832 return HttpComponentsResolver( 3833 retriever=retriever, 3834 config=config, 3835 components_mapping=components_mapping, 3836 parameters=model.parameters or {}, 3837 ) 3838 3839 @staticmethod 3840 def create_stream_config( 3841 model: StreamConfigModel, config: Config, **kwargs: Any 3842 ) -> StreamConfig: 3843 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3844 [x for x in model.configs_pointer] if model.configs_pointer else [] 3845 ) 3846 3847 return StreamConfig( 3848 configs_pointer=model_configs_pointer, 3849 default_values=model.default_values, 3850 parameters=model.parameters or {}, 3851 ) 3852 3853 def create_config_components_resolver( 3854 self, model: ConfigComponentsResolverModel, config: Config 3855 ) -> Any: 3856 model_stream_configs = ( 3857 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3858 ) 3859 3860 stream_configs = [ 3861 self._create_component_from_model( 3862 stream_config, config=config, parameters=model.parameters or {} 3863 ) 3864 for stream_config in model_stream_configs 3865 ] 3866 3867 components_mapping = [ 3868 self._create_component_from_model( 3869 model=components_mapping_definition_model, 3870 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3871 components_mapping_definition_model.value_type 3872 ), 3873 config=config, 3874 ) 3875 for components_mapping_definition_model in model.components_mapping 3876 ] 3877 3878 return ConfigComponentsResolver( 3879 stream_configs=stream_configs, 3880 config=config, 3881 components_mapping=components_mapping, 3882 parameters=model.parameters or {}, 3883 ) 3884 3885 def create_parametrized_components_resolver( 3886 self, model: ParametrizedComponentsResolverModel, config: Config 3887 ) -> ParametrizedComponentsResolver: 3888 stream_parameters = StreamParametersDefinition( 3889 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3890 ) 3891 components_mapping = [ 3892 self._create_component_from_model( 3893 model=components_mapping_definition_model, 3894 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3895 components_mapping_definition_model.value_type 3896 ), 3897 config=config, 3898 ) 3899 for components_mapping_definition_model in model.components_mapping 3900 ] 3901 return ParametrizedComponentsResolver( 3902 stream_parameters=stream_parameters, 3903 config=config, 3904 components_mapping=components_mapping, 3905 parameters=model.parameters or {}, 3906 ) 3907 3908 _UNSUPPORTED_DECODER_ERROR = ( 3909 "Specified decoder of {decoder_type} is not supported for pagination." 3910 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3911 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3912 ) 3913 3914 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3915 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3916 return True 3917 elif isinstance(decoder, CompositeRawDecoder): 3918 return self._is_supported_parser_for_pagination(decoder.parser) 3919 else: 3920 return False 3921 3922 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3923 if isinstance(parser, JsonParser): 3924 return True 3925 elif isinstance(parser, GzipParser): 3926 return isinstance(parser.inner_parser, JsonParser) 3927 else: 3928 return False 3929 3930 def create_http_api_budget( 3931 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3932 ) -> HttpAPIBudget: 3933 policies = [ 3934 self._create_component_from_model(model=policy, config=config) 3935 for policy in model.policies 3936 ] 3937 3938 return HttpAPIBudget( 3939 policies=policies, 3940 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3941 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3942 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3943 ) 3944 3945 def create_fixed_window_call_rate_policy( 3946 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3947 ) -> FixedWindowCallRatePolicy: 3948 matchers = [ 3949 self._create_component_from_model(model=matcher, config=config) 3950 for matcher in model.matchers 3951 ] 3952 3953 # Set the initial reset timestamp to 10 days from now. 3954 # This value will be updated by the first request. 3955 return FixedWindowCallRatePolicy( 3956 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3957 period=parse_duration(model.period), 3958 call_limit=model.call_limit, 3959 matchers=matchers, 3960 ) 3961 3962 def create_file_uploader( 3963 self, model: FileUploaderModel, config: Config, **kwargs: Any 3964 ) -> FileUploader: 3965 name = "File Uploader" 3966 requester = self._create_component_from_model( 3967 model=model.requester, 3968 config=config, 3969 name=name, 3970 **kwargs, 3971 ) 3972 download_target_extractor = self._create_component_from_model( 3973 model=model.download_target_extractor, 3974 config=config, 3975 name=name, 3976 **kwargs, 3977 ) 3978 emit_connector_builder_messages = self._emit_connector_builder_messages 3979 file_uploader = DefaultFileUploader( 3980 requester=requester, 3981 download_target_extractor=download_target_extractor, 3982 config=config, 3983 file_writer=NoopFileWriter() 3984 if emit_connector_builder_messages 3985 else LocalFileSystemFileWriter(), 3986 parameters=model.parameters or {}, 3987 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3988 ) 3989 3990 return ( 3991 ConnectorBuilderFileUploader(file_uploader) 3992 if emit_connector_builder_messages 3993 else file_uploader 3994 ) 3995 3996 def create_moving_window_call_rate_policy( 3997 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3998 ) -> MovingWindowCallRatePolicy: 3999 rates = [ 4000 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4001 ] 4002 matchers = [ 4003 self._create_component_from_model(model=matcher, config=config) 4004 for matcher in model.matchers 4005 ] 4006 return MovingWindowCallRatePolicy( 4007 rates=rates, 4008 matchers=matchers, 4009 ) 4010 4011 def create_unlimited_call_rate_policy( 4012 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4013 ) -> UnlimitedCallRatePolicy: 4014 matchers = [ 4015 
self._create_component_from_model(model=matcher, config=config) 4016 for matcher in model.matchers 4017 ] 4018 4019 return UnlimitedCallRatePolicy( 4020 matchers=matchers, 4021 ) 4022 4023 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4024 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4025 return Rate( 4026 limit=int(interpolated_limit.eval(config=config)), 4027 interval=parse_duration(model.interval), 4028 ) 4029 4030 def create_http_request_matcher( 4031 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4032 ) -> HttpRequestRegexMatcher: 4033 return HttpRequestRegexMatcher( 4034 method=model.method, 4035 url_base=model.url_base, 4036 url_path_pattern=model.url_path_pattern, 4037 params=model.params, 4038 headers=model.headers, 4039 ) 4040 4041 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4042 self._api_budget = self.create_component( 4043 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4044 ) 4045 4046 def create_grouping_partition_router( 4047 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4048 ) -> GroupingPartitionRouter: 4049 underlying_router = self._create_component_from_model( 4050 model=model.underlying_partition_router, config=config 4051 ) 4052 if model.group_size < 1: 4053 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4054 4055 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4056 # because they are specific to individual partitions and cannot be aggregated or handled 4057 # when grouping, potentially leading to incorrect API calls. Any request customization 4058 # should be managed at the stream level through the requester's configuration. 4059 if isinstance(underlying_router, SubstreamPartitionRouter): 4060 if any( 4061 parent_config.request_option 4062 for parent_config in underlying_router.parent_stream_configs 4063 ): 4064 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4065 4066 if isinstance(underlying_router, ListPartitionRouter): 4067 if underlying_router.request_option: 4068 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4069 4070 return GroupingPartitionRouter( 4071 group_size=model.group_size, 4072 underlying_partition_router=underlying_router, 4073 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4074 config=config, 4075 )
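
# ---------------------------------------------------------------------------------------------
# Illustrative example (not part of the module source): a minimal sketch of how a manifest
# component definition is turned into a runtime component by this factory. `create_component`
# (defined below) checks that the definition's "type" matches the Pydantic model class name,
# parses the mapping with `parse_obj`, and dispatches to the matching create_* method through
# PYDANTIC_MODEL_TO_CONSTRUCTOR. The field values below ("section", ["a", "b", "c"]) are made-up
# placeholders, and the example assumes the generated model class is named "ListPartitionRouter",
# which is why TYPE_NAME_TO_MODEL is used to look the class up by name instead of importing it.

def _example_build_list_partition_router() -> None:
    factory = ModelToComponentFactory()
    partition_router = factory.create_component(
        model_type=factory.TYPE_NAME_TO_MODEL["ListPartitionRouter"],
        component_definition={
            "type": "ListPartitionRouter",
            "cursor_field": "section",
            "values": ["a", "b", "c"],
        },
        config={},
    )
    # The result is a ListPartitionRouter instance built by create_list_partition_router.
    assert isinstance(partition_router, ListPartitionRouter)
# ---------------------------------------------------------------------------------------------
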
628class ModelToComponentFactory: 629 EPOCH_DATETIME_FORMAT = "%s" 630 631 def __init__( 632 self, 633 limit_pages_fetched_per_slice: Optional[int] = None, 634 limit_slices_fetched: Optional[int] = None, 635 emit_connector_builder_messages: bool = False, 636 disable_retries: bool = False, 637 disable_cache: bool = False, 638 disable_resumable_full_refresh: bool = False, 639 message_repository: Optional[MessageRepository] = None, 640 connector_state_manager: Optional[ConnectorStateManager] = None, 641 max_concurrent_async_job_count: Optional[int] = None, 642 ): 643 self._init_mappings() 644 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 645 self._limit_slices_fetched = limit_slices_fetched 646 self._emit_connector_builder_messages = emit_connector_builder_messages 647 self._disable_retries = disable_retries 648 self._disable_cache = disable_cache 649 self._disable_resumable_full_refresh = disable_resumable_full_refresh 650 self._message_repository = message_repository or InMemoryMessageRepository( 651 self._evaluate_log_level(emit_connector_builder_messages) 652 ) 653 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 654 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 655 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 656 # placeholder for deprecation warnings 657 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 658 659 def _init_mappings(self) -> None: 660 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 661 AddedFieldDefinitionModel: self.create_added_field_definition, 662 AddFieldsModel: self.create_add_fields, 663 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 664 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 665 BearerAuthenticatorModel: self.create_bearer_authenticator, 666 CheckStreamModel: self.create_check_stream, 667 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 668 CheckDynamicStreamModel: self.create_check_dynamic_stream, 669 CompositeErrorHandlerModel: self.create_composite_error_handler, 670 ConcurrencyLevelModel: self.create_concurrency_level, 671 ConfigMigrationModel: self.create_config_migration, 672 ConfigAddFieldsModel: self.create_config_add_fields, 673 ConfigRemapFieldModel: self.create_config_remap_field, 674 ConfigRemoveFieldsModel: self.create_config_remove_fields, 675 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 676 CsvDecoderModel: self.create_csv_decoder, 677 CursorPaginationModel: self.create_cursor_pagination, 678 CustomAuthenticatorModel: self.create_custom_component, 679 CustomBackoffStrategyModel: self.create_custom_component, 680 CustomDecoderModel: self.create_custom_component, 681 CustomErrorHandlerModel: self.create_custom_component, 682 CustomIncrementalSyncModel: self.create_custom_component, 683 CustomRecordExtractorModel: self.create_custom_component, 684 CustomRecordFilterModel: self.create_custom_component, 685 CustomRequesterModel: self.create_custom_component, 686 CustomRetrieverModel: self.create_custom_component, 687 CustomSchemaLoader: self.create_custom_component, 688 CustomSchemaNormalizationModel: self.create_custom_component, 689 CustomStateMigration: self.create_custom_component, 690 CustomPaginationStrategyModel: self.create_custom_component, 691 CustomPartitionRouterModel: self.create_custom_component, 692 CustomTransformationModel: self.create_custom_component, 693 
CustomValidationStrategyModel: self.create_custom_component, 694 CustomConfigTransformationModel: self.create_custom_component, 695 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 696 DeclarativeStreamModel: self.create_declarative_stream, 697 DefaultErrorHandlerModel: self.create_default_error_handler, 698 DefaultPaginatorModel: self.create_default_paginator, 699 DpathExtractorModel: self.create_dpath_extractor, 700 DpathValidatorModel: self.create_dpath_validator, 701 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 702 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 703 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 704 GroupByKeyMergeStrategyModel: self.create_group_by_key, 705 HttpRequesterModel: self.create_http_requester, 706 HttpResponseFilterModel: self.create_http_response_filter, 707 InlineSchemaLoaderModel: self.create_inline_schema_loader, 708 JsonDecoderModel: self.create_json_decoder, 709 JsonlDecoderModel: self.create_jsonl_decoder, 710 GzipDecoderModel: self.create_gzip_decoder, 711 KeysToLowerModel: self.create_keys_to_lower_transformation, 712 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 713 KeysReplaceModel: self.create_keys_replace_transformation, 714 FlattenFieldsModel: self.create_flatten_fields, 715 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 716 IterableDecoderModel: self.create_iterable_decoder, 717 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 718 XmlDecoderModel: self.create_xml_decoder, 719 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 720 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 721 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 722 TypesMapModel: self.create_types_map, 723 ComplexFieldTypeModel: self.create_complex_field_type, 724 JwtAuthenticatorModel: self.create_jwt_authenticator, 725 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 726 ListPartitionRouterModel: self.create_list_partition_router, 727 MinMaxDatetimeModel: self.create_min_max_datetime, 728 NoAuthModel: self.create_no_auth, 729 NoPaginationModel: self.create_no_pagination, 730 OAuthAuthenticatorModel: self.create_oauth_authenticator, 731 OffsetIncrementModel: self.create_offset_increment, 732 PageIncrementModel: self.create_page_increment, 733 ParentStreamConfigModel: self.create_parent_stream_config, 734 PredicateValidatorModel: self.create_predicate_validator, 735 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 736 PropertyChunkingModel: self.create_property_chunking, 737 QueryPropertiesModel: self.create_query_properties, 738 RecordFilterModel: self.create_record_filter, 739 RecordSelectorModel: self.create_record_selector, 740 RemoveFieldsModel: self.create_remove_fields, 741 RequestPathModel: self.create_request_path, 742 RequestOptionModel: self.create_request_option, 743 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 744 SelectiveAuthenticatorModel: self.create_selective_authenticator, 745 SimpleRetrieverModel: self.create_simple_retriever, 746 StateDelegatingStreamModel: self.create_state_delegating_stream, 747 SpecModel: self.create_spec, 748 SubstreamPartitionRouterModel: self.create_substream_partition_router, 749 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 750 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 751 WaitUntilTimeFromHeaderModel: 
self.create_wait_until_time_from_header, 752 AsyncRetrieverModel: self.create_async_retriever, 753 HttpComponentsResolverModel: self.create_http_components_resolver, 754 ConfigComponentsResolverModel: self.create_config_components_resolver, 755 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 756 StreamConfigModel: self.create_stream_config, 757 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 758 ZipfileDecoderModel: self.create_zipfile_decoder, 759 HTTPAPIBudgetModel: self.create_http_api_budget, 760 FileUploaderModel: self.create_file_uploader, 761 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 762 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 763 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 764 RateModel: self.create_rate, 765 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 766 GroupingPartitionRouterModel: self.create_grouping_partition_router, 767 } 768 769 # Needed for the case where we need to perform a second parse on the fields of a custom component 770 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 771 772 def create_component( 773 self, 774 model_type: Type[BaseModel], 775 component_definition: ComponentDefinition, 776 config: Config, 777 **kwargs: Any, 778 ) -> Any: 779 """ 780 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 781 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 782 declarative components from that model. 783 784 :param model_type: The type of declarative component that is being initialized 785 :param component_definition: The mapping that represents a declarative component 786 :param config: The connector config that is provided by the customer 787 :return: The declarative component to be used at runtime 788 """ 789 790 component_type = component_definition.get("type") 791 if component_definition.get("type") != model_type.__name__: 792 raise ValueError( 793 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 794 ) 795 796 declarative_component_model = model_type.parse_obj(component_definition) 797 798 if not isinstance(declarative_component_model, model_type): 799 raise ValueError( 800 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 801 ) 802 803 return self._create_component_from_model( 804 model=declarative_component_model, config=config, **kwargs 805 ) 806 807 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 808 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 809 raise ValueError( 810 f"{model.__class__} with attributes {model} is not a valid component type" 811 ) 812 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 813 if not component_constructor: 814 raise ValueError(f"Could not find constructor for {model.__class__}") 815 816 # collect deprecation warnings for supported models.
817 if isinstance(model, BaseModelWithDeprecations): 818 self._collect_model_deprecations(model) 819 820 return component_constructor(model=model, config=config, **kwargs) 821 822 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 823 """ 824 Returns the deprecation warnings that were collected during the creation of components. 825 """ 826 return self._collected_deprecation_logs 827 828 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 829 """ 830 Collects deprecation logs from the given model and appends any new logs to the internal collection. 831 832 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 833 834 Args: 835 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 836 """ 837 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 838 for log in model._deprecation_logs: 839 # avoid duplicates for deprecation logs observed. 840 if log not in self._collected_deprecation_logs: 841 self._collected_deprecation_logs.append(log) 842 843 def create_config_migration( 844 self, model: ConfigMigrationModel, config: Config 845 ) -> ConfigMigration: 846 transformations: List[ConfigTransformation] = [ 847 self._create_component_from_model(transformation, config) 848 for transformation in model.transformations 849 ] 850 851 return ConfigMigration( 852 description=model.description, 853 transformations=transformations, 854 ) 855 856 def create_config_add_fields( 857 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 858 ) -> ConfigAddFields: 859 fields = [self._create_component_from_model(field, config) for field in model.fields] 860 return ConfigAddFields( 861 fields=fields, 862 condition=model.condition or "", 863 ) 864 865 @staticmethod 866 def create_config_remove_fields( 867 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 868 ) -> ConfigRemoveFields: 869 return ConfigRemoveFields( 870 field_pointers=model.field_pointers, 871 condition=model.condition or "", 872 ) 873 874 @staticmethod 875 def create_config_remap_field( 876 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 877 ) -> ConfigRemapField: 878 mapping = cast(Mapping[str, Any], model.map) 879 return ConfigRemapField( 880 map=mapping, 881 field_path=model.field_path, 882 config=config, 883 ) 884 885 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 886 strategy = self._create_component_from_model(model.validation_strategy, config) 887 888 return DpathValidator( 889 field_path=model.field_path, 890 strategy=strategy, 891 ) 892 893 def create_predicate_validator( 894 self, model: PredicateValidatorModel, config: Config 895 ) -> PredicateValidator: 896 strategy = self._create_component_from_model(model.validation_strategy, config) 897 898 return PredicateValidator( 899 value=model.value, 900 strategy=strategy, 901 ) 902 903 @staticmethod 904 def create_validate_adheres_to_schema( 905 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 906 ) -> ValidateAdheresToSchema: 907 base_schema = cast(Mapping[str, Any], model.base_schema) 908 return ValidateAdheresToSchema( 909 schema=base_schema, 910 ) 911 912 @staticmethod 913 
def create_added_field_definition( 914 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 915 ) -> AddedFieldDefinition: 916 interpolated_value = InterpolatedString.create( 917 model.value, parameters=model.parameters or {} 918 ) 919 return AddedFieldDefinition( 920 path=model.path, 921 value=interpolated_value, 922 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 923 parameters=model.parameters or {}, 924 ) 925 926 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 927 added_field_definitions = [ 928 self._create_component_from_model( 929 model=added_field_definition_model, 930 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 931 added_field_definition_model.value_type 932 ), 933 config=config, 934 ) 935 for added_field_definition_model in model.fields 936 ] 937 return AddFields( 938 fields=added_field_definitions, 939 condition=model.condition or "", 940 parameters=model.parameters or {}, 941 ) 942 943 def create_keys_to_lower_transformation( 944 self, model: KeysToLowerModel, config: Config, **kwargs: Any 945 ) -> KeysToLowerTransformation: 946 return KeysToLowerTransformation() 947 948 def create_keys_to_snake_transformation( 949 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 950 ) -> KeysToSnakeCaseTransformation: 951 return KeysToSnakeCaseTransformation() 952 953 def create_keys_replace_transformation( 954 self, model: KeysReplaceModel, config: Config, **kwargs: Any 955 ) -> KeysReplaceTransformation: 956 return KeysReplaceTransformation( 957 old=model.old, new=model.new, parameters=model.parameters or {} 958 ) 959 960 def create_flatten_fields( 961 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 962 ) -> FlattenFields: 963 return FlattenFields( 964 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 965 ) 966 967 def create_dpath_flatten_fields( 968 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 969 ) -> DpathFlattenFields: 970 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 971 key_transformation = ( 972 KeyTransformation( 973 config=config, 974 prefix=model.key_transformation.prefix, 975 suffix=model.key_transformation.suffix, 976 parameters=model.parameters or {}, 977 ) 978 if model.key_transformation is not None 979 else None 980 ) 981 return DpathFlattenFields( 982 config=config, 983 field_path=model_field_path, 984 delete_origin_value=model.delete_origin_value 985 if model.delete_origin_value is not None 986 else False, 987 replace_record=model.replace_record if model.replace_record is not None else False, 988 key_transformation=key_transformation, 989 parameters=model.parameters or {}, 990 ) 991 992 @staticmethod 993 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 994 if not value_type: 995 return None 996 names_to_types = { 997 ValueType.string: str, 998 ValueType.number: float, 999 ValueType.integer: int, 1000 ValueType.boolean: bool, 1001 } 1002 return names_to_types[value_type] 1003 1004 def create_api_key_authenticator( 1005 self, 1006 model: ApiKeyAuthenticatorModel, 1007 config: Config, 1008 token_provider: Optional[TokenProvider] = None, 1009 **kwargs: Any, 1010 ) -> ApiKeyAuthenticator: 1011 if model.inject_into is None and model.header is None: 1012 raise ValueError( 1013 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1014 ) 1015 1016 if model.inject_into is not None 
and model.header is not None: 1017 raise ValueError( 1018 "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option" 1019 ) 1020 1021 if token_provider is not None and model.api_token != "": 1022 raise ValueError( 1023 "If token_provider is set, api_token is ignored and has to be set to empty string." 1024 ) 1025 1026 request_option = ( 1027 self._create_component_from_model( 1028 model.inject_into, config, parameters=model.parameters or {} 1029 ) 1030 if model.inject_into 1031 else RequestOption( 1032 inject_into=RequestOptionType.header, 1033 field_name=model.header or "", 1034 parameters=model.parameters or {}, 1035 ) 1036 ) 1037 1038 return ApiKeyAuthenticator( 1039 token_provider=( 1040 token_provider 1041 if token_provider is not None 1042 else InterpolatedStringTokenProvider( 1043 api_token=model.api_token or "", 1044 config=config, 1045 parameters=model.parameters or {}, 1046 ) 1047 ), 1048 request_option=request_option, 1049 config=config, 1050 parameters=model.parameters or {}, 1051 ) 1052 1053 def create_legacy_to_per_partition_state_migration( 1054 self, 1055 model: LegacyToPerPartitionStateMigrationModel, 1056 config: Mapping[str, Any], 1057 declarative_stream: DeclarativeStreamModel, 1058 ) -> LegacyToPerPartitionStateMigration: 1059 retriever = declarative_stream.retriever 1060 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1061 raise ValueError( 1062 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1063 ) 1064 partition_router = retriever.partition_router 1065 if not isinstance( 1066 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1067 ): 1068 raise ValueError( 1069 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1070 ) 1071 if not hasattr(partition_router, "parent_stream_configs"): 1072 raise ValueError( 1073 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1074 ) 1075 1076 if not hasattr(declarative_stream, "incremental_sync"): 1077 raise ValueError( 1078 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1079 ) 1080 1081 return LegacyToPerPartitionStateMigration( 1082 partition_router, # type: ignore # was already checked above 1083 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams.
1084 config, 1085 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1086 ) 1087 1088 def create_session_token_authenticator( 1089 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1090 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1091 decoder = ( 1092 self._create_component_from_model(model=model.decoder, config=config) 1093 if model.decoder 1094 else JsonDecoder(parameters={}) 1095 ) 1096 login_requester = self._create_component_from_model( 1097 model=model.login_requester, 1098 config=config, 1099 name=f"{name}_login_requester", 1100 decoder=decoder, 1101 ) 1102 token_provider = SessionTokenProvider( 1103 login_requester=login_requester, 1104 session_token_path=model.session_token_path, 1105 expiration_duration=parse_duration(model.expiration_duration) 1106 if model.expiration_duration 1107 else None, 1108 parameters=model.parameters or {}, 1109 message_repository=self._message_repository, 1110 decoder=decoder, 1111 ) 1112 if model.request_authentication.type == "Bearer": 1113 return ModelToComponentFactory.create_bearer_authenticator( 1114 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1115 config, 1116 token_provider=token_provider, 1117 ) 1118 else: 1119 return self.create_api_key_authenticator( 1120 ApiKeyAuthenticatorModel( 1121 type="ApiKeyAuthenticator", 1122 api_token="", 1123 inject_into=model.request_authentication.inject_into, 1124 ), # type: ignore # $parameters and headers default to None 1125 config=config, 1126 token_provider=token_provider, 1127 ) 1128 1129 @staticmethod 1130 def create_basic_http_authenticator( 1131 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1132 ) -> BasicHttpAuthenticator: 1133 return BasicHttpAuthenticator( 1134 password=model.password or "", 1135 username=model.username, 1136 config=config, 1137 parameters=model.parameters or {}, 1138 ) 1139 1140 @staticmethod 1141 def create_bearer_authenticator( 1142 model: BearerAuthenticatorModel, 1143 config: Config, 1144 token_provider: Optional[TokenProvider] = None, 1145 **kwargs: Any, 1146 ) -> BearerAuthenticator: 1147 if token_provider is not None and model.api_token != "": 1148 raise ValueError( 1149 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1150 ) 1151 return BearerAuthenticator( 1152 token_provider=( 1153 token_provider 1154 if token_provider is not None 1155 else InterpolatedStringTokenProvider( 1156 api_token=model.api_token or "", 1157 config=config, 1158 parameters=model.parameters or {}, 1159 ) 1160 ), 1161 config=config, 1162 parameters=model.parameters or {}, 1163 ) 1164 1165 @staticmethod 1166 def create_dynamic_stream_check_config( 1167 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1168 ) -> DynamicStreamCheckConfig: 1169 return DynamicStreamCheckConfig( 1170 dynamic_stream_name=model.dynamic_stream_name, 1171 stream_count=model.stream_count or 0, 1172 ) 1173 1174 def create_check_stream( 1175 self, model: CheckStreamModel, config: Config, **kwargs: Any 1176 ) -> CheckStream: 1177 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1178 raise ValueError( 1179 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1180 ) 1181 1182 dynamic_streams_check_configs = ( 1183 [ 1184 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1185 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1186 ] 1187 if model.dynamic_streams_check_configs 1188 else [] 1189 ) 1190 1191 return CheckStream( 1192 stream_names=model.stream_names or [], 1193 dynamic_streams_check_configs=dynamic_streams_check_configs, 1194 parameters={}, 1195 ) 1196 1197 @staticmethod 1198 def create_check_dynamic_stream( 1199 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1200 ) -> CheckDynamicStream: 1201 assert model.use_check_availability is not None # for mypy 1202 1203 use_check_availability = model.use_check_availability 1204 1205 return CheckDynamicStream( 1206 stream_count=model.stream_count, 1207 use_check_availability=use_check_availability, 1208 parameters={}, 1209 ) 1210 1211 def create_composite_error_handler( 1212 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1213 ) -> CompositeErrorHandler: 1214 error_handlers = [ 1215 self._create_component_from_model(model=error_handler_model, config=config) 1216 for error_handler_model in model.error_handlers 1217 ] 1218 return CompositeErrorHandler( 1219 error_handlers=error_handlers, parameters=model.parameters or {} 1220 ) 1221 1222 @staticmethod 1223 def create_concurrency_level( 1224 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1225 ) -> ConcurrencyLevel: 1226 return ConcurrencyLevel( 1227 default_concurrency=model.default_concurrency, 1228 max_concurrency=model.max_concurrency, 1229 config=config, 1230 parameters={}, 1231 ) 1232 1233 @staticmethod 1234 def apply_stream_state_migrations( 1235 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1236 ) -> MutableMapping[str, Any]: 1237 if stream_state_migrations: 1238 for state_migration in stream_state_migrations: 1239 if state_migration.should_migrate(stream_state): 1240 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
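# --- Illustrative sketch (not part of the factory implementation) ---
# Anything passed via `stream_state_migrations` only needs to expose
# `should_migrate(state)` and `migrate(state)`. The migration below is a
# hypothetical example that renames a legacy cursor key; names and values
# are made up for illustration.
#
#     class RenameLegacyCursorKey:
#         def should_migrate(self, stream_state):
#             return "updated" in stream_state and "updated_at" not in stream_state
#
#         def migrate(self, stream_state):
#             return {"updated_at": stream_state["updated"]}
#
#     ModelToComponentFactory.apply_stream_state_migrations(
#         [RenameLegacyCursorKey()], {"updated": "2024-01-01T00:00:00Z"}
#     )  # -> {"updated_at": "2024-01-01T00:00:00Z"}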
1241 stream_state = dict(state_migration.migrate(stream_state)) 1242 return stream_state 1243 1244 def create_concurrent_cursor_from_datetime_based_cursor( 1245 self, 1246 model_type: Type[BaseModel], 1247 component_definition: ComponentDefinition, 1248 stream_name: str, 1249 stream_namespace: Optional[str], 1250 config: Config, 1251 message_repository: Optional[MessageRepository] = None, 1252 runtime_lookback_window: Optional[datetime.timedelta] = None, 1253 stream_state_migrations: Optional[List[Any]] = None, 1254 **kwargs: Any, 1255 ) -> ConcurrentCursor: 1256 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1257 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1258 # incoming state and connector_state_manager that is initialized when the component factory is created 1259 stream_state = ( 1260 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1261 if "stream_state" not in kwargs 1262 else kwargs["stream_state"] 1263 ) 1264 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1265 1266 component_type = component_definition.get("type") 1267 if component_definition.get("type") != model_type.__name__: 1268 raise ValueError( 1269 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1270 ) 1271 1272 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1273 1274 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1275 raise ValueError( 1276 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1277 ) 1278 1279 interpolated_cursor_field = InterpolatedString.create( 1280 datetime_based_cursor_model.cursor_field, 1281 parameters=datetime_based_cursor_model.parameters or {}, 1282 ) 1283 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1284 1285 interpolated_partition_field_start = InterpolatedString.create( 1286 datetime_based_cursor_model.partition_field_start or "start_time", 1287 parameters=datetime_based_cursor_model.parameters or {}, 1288 ) 1289 interpolated_partition_field_end = InterpolatedString.create( 1290 datetime_based_cursor_model.partition_field_end or "end_time", 1291 parameters=datetime_based_cursor_model.parameters or {}, 1292 ) 1293 1294 slice_boundary_fields = ( 1295 interpolated_partition_field_start.eval(config=config), 1296 interpolated_partition_field_end.eval(config=config), 1297 ) 1298 1299 datetime_format = datetime_based_cursor_model.datetime_format 1300 1301 cursor_granularity = ( 1302 parse_duration(datetime_based_cursor_model.cursor_granularity) 1303 if datetime_based_cursor_model.cursor_granularity 1304 else None 1305 ) 1306 1307 lookback_window = None 1308 interpolated_lookback_window = ( 1309 InterpolatedString.create( 1310 datetime_based_cursor_model.lookback_window, 1311 parameters=datetime_based_cursor_model.parameters or {}, 1312 ) 1313 if datetime_based_cursor_model.lookback_window 1314 else None 1315 ) 1316 if interpolated_lookback_window: 1317 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1318 if evaluated_lookback_window: 1319 lookback_window = parse_duration(evaluated_lookback_window) 1320 1321 connector_state_converter: DateTimeStreamStateConverter 1322 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1323 datetime_format=datetime_format, 1324 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1325 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1326 cursor_granularity=cursor_granularity, 1327 ) 1328 1329 # Adjusts the stream state by applying the runtime lookback window. 1330 # This is used to ensure correct state handling in case of failed partitions. 1331 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1332 if runtime_lookback_window and stream_state_value: 1333 new_stream_state = ( 1334 connector_state_converter.parse_timestamp(stream_state_value) 1335 - runtime_lookback_window 1336 ) 1337 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1338 new_stream_state 1339 ) 1340 1341 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1342 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1343 start_date_runtime_value = self.create_min_max_datetime( 1344 model=datetime_based_cursor_model.start_datetime, config=config 1345 ) 1346 else: 1347 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1348 1349 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1350 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1351 end_date_runtime_value = self.create_min_max_datetime( 1352 model=datetime_based_cursor_model.end_datetime, config=config 1353 ) 1354 else: 1355 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1356 1357 interpolated_start_date = MinMaxDatetime.create( 1358 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1359 parameters=datetime_based_cursor_model.parameters, 1360 ) 1361 interpolated_end_date = ( 1362 None 1363 if not end_date_runtime_value 1364 else MinMaxDatetime.create( 1365 end_date_runtime_value, datetime_based_cursor_model.parameters 1366 ) 1367 ) 1368 1369 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1370 if not interpolated_start_date.datetime_format: 1371 interpolated_start_date.datetime_format = datetime_format 1372 if interpolated_end_date and not interpolated_end_date.datetime_format: 1373 interpolated_end_date.datetime_format = datetime_format 1374 1375 start_date = interpolated_start_date.get_datetime(config=config) 1376 end_date_provider = ( 1377 partial(interpolated_end_date.get_datetime, config) 1378 if interpolated_end_date 1379 else connector_state_converter.get_end_provider() 1380 ) 1381 1382 if ( 1383 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1384 ) or ( 1385 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1386 ): 1387 raise ValueError( 1388 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1389 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1390 ) 1391 1392 # When step is not defined, default to a step size from the starting date to the present moment 1393 step_length = datetime.timedelta.max 1394 interpolated_step = ( 1395 InterpolatedString.create( 1396 datetime_based_cursor_model.step, 1397 parameters=datetime_based_cursor_model.parameters or {}, 1398 ) 1399 if datetime_based_cursor_model.step 1400 else None 1401 ) 1402 if interpolated_step: 1403 evaluated_step = interpolated_step.eval(config) 1404 if evaluated_step: 1405 step_length = parse_duration(evaluated_step) 1406 1407 clamping_strategy: ClampingStrategy = NoClamping() 1408 if datetime_based_cursor_model.clamping: 1409 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1410 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1411 # object which we want to keep agnostic of being low-code 1412 target = InterpolatedString( 1413 string=datetime_based_cursor_model.clamping.target, 1414 parameters=datetime_based_cursor_model.parameters or {}, 1415 ) 1416 evaluated_target = target.eval(config=config) 1417 match evaluated_target: 1418 case "DAY": 1419 clamping_strategy = DayClampingStrategy() 1420 end_date_provider = ClampingEndProvider( 1421 DayClampingStrategy(is_ceiling=False), 1422 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1423 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1424 ) 1425 case "WEEK": 1426 if ( 1427 not datetime_based_cursor_model.clamping.target_details 1428 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1429 ): 1430 raise ValueError( 1431 "Given WEEK clamping, weekday needs to be provided as target_details" 1432 ) 1433 weekday = self._assemble_weekday( 1434 datetime_based_cursor_model.clamping.target_details["weekday"] 1435 ) 1436 clamping_strategy = WeekClampingStrategy(weekday) 1437 end_date_provider = ClampingEndProvider( 1438 WeekClampingStrategy(weekday, is_ceiling=False), 1439 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1440 granularity=cursor_granularity or datetime.timedelta(days=1), 1441 ) 1442 case "MONTH": 1443 clamping_strategy = MonthClampingStrategy() 1444 end_date_provider = ClampingEndProvider( 1445 MonthClampingStrategy(is_ceiling=False), 1446 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1447 granularity=cursor_granularity or datetime.timedelta(days=1), 1448 ) 1449 case _: 1450 raise ValueError( 1451 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1452 ) 1453 1454 return ConcurrentCursor( 1455 stream_name=stream_name, 1456 stream_namespace=stream_namespace, 1457 stream_state=stream_state, 1458 message_repository=message_repository or self._message_repository, 1459 connector_state_manager=self._connector_state_manager, 1460 connector_state_converter=connector_state_converter, 1461 cursor_field=cursor_field, 1462 slice_boundary_fields=slice_boundary_fields, 1463 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1464 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1465 lookback_window=lookback_window, 1466 slice_range=step_length, 1467 cursor_granularity=cursor_granularity, 1468 clamping_strategy=clamping_strategy, 1469 ) 1470 1471 def create_concurrent_cursor_from_incrementing_count_cursor( 1472 self, 1473 model_type: Type[BaseModel], 1474 component_definition: ComponentDefinition, 1475 stream_name: str, 1476 stream_namespace: Optional[str], 1477 config: Config, 1478 message_repository: Optional[MessageRepository] = None, 1479 **kwargs: Any, 1480 ) -> ConcurrentCursor: 1481 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1482 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1483 # incoming state and connector_state_manager that is initialized when the component factory is created 1484 stream_state = ( 1485 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1486 if "stream_state" not in kwargs 1487 else kwargs["stream_state"] 1488 ) 1489 1490 component_type = component_definition.get("type") 1491 if component_definition.get("type") != model_type.__name__: 1492 raise ValueError( 1493 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1494 ) 1495 1496 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1497 1498 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1499 raise ValueError( 1500 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1501 ) 1502 1503 interpolated_start_value = ( 1504 InterpolatedString.create( 1505 incrementing_count_cursor_model.start_value, # type: ignore 1506 parameters=incrementing_count_cursor_model.parameters or {}, 1507 ) 1508 if incrementing_count_cursor_model.start_value 1509 else 0 1510 ) 1511 1512 interpolated_cursor_field = InterpolatedString.create( 1513 incrementing_count_cursor_model.cursor_field, 1514 parameters=incrementing_count_cursor_model.parameters or {}, 1515 ) 1516 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1517 1518 connector_state_converter = IncrementingCountStreamStateConverter( 1519 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1520 ) 1521 1522 return ConcurrentCursor( 1523 stream_name=stream_name, 1524 stream_namespace=stream_namespace, 1525 stream_state=stream_state, 1526 message_repository=message_repository or self._message_repository, 
1527 connector_state_manager=self._connector_state_manager, 1528 connector_state_converter=connector_state_converter, 1529 cursor_field=cursor_field, 1530 slice_boundary_fields=None, 1531 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1532 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 ) 1534 1535 def _assemble_weekday(self, weekday: str) -> Weekday: 1536 match weekday: 1537 case "MONDAY": 1538 return Weekday.MONDAY 1539 case "TUESDAY": 1540 return Weekday.TUESDAY 1541 case "WEDNESDAY": 1542 return Weekday.WEDNESDAY 1543 case "THURSDAY": 1544 return Weekday.THURSDAY 1545 case "FRIDAY": 1546 return Weekday.FRIDAY 1547 case "SATURDAY": 1548 return Weekday.SATURDAY 1549 case "SUNDAY": 1550 return Weekday.SUNDAY 1551 case _: 1552 raise ValueError(f"Unknown weekday {weekday}") 1553 1554 def create_concurrent_cursor_from_perpartition_cursor( 1555 self, 1556 state_manager: ConnectorStateManager, 1557 model_type: Type[BaseModel], 1558 component_definition: ComponentDefinition, 1559 stream_name: str, 1560 stream_namespace: Optional[str], 1561 config: Config, 1562 stream_state: MutableMapping[str, Any], 1563 partition_router: PartitionRouter, 1564 stream_state_migrations: Optional[List[Any]] = None, 1565 **kwargs: Any, 1566 ) -> ConcurrentPerPartitionCursor: 1567 component_type = component_definition.get("type") 1568 if component_definition.get("type") != model_type.__name__: 1569 raise ValueError( 1570 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1571 ) 1572 1573 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1574 1575 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1576 raise ValueError( 1577 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1578 ) 1579 1580 interpolated_cursor_field = InterpolatedString.create( 1581 datetime_based_cursor_model.cursor_field, 1582 parameters=datetime_based_cursor_model.parameters or {}, 1583 ) 1584 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1585 1586 datetime_format = datetime_based_cursor_model.datetime_format 1587 1588 cursor_granularity = ( 1589 parse_duration(datetime_based_cursor_model.cursor_granularity) 1590 if datetime_based_cursor_model.cursor_granularity 1591 else None 1592 ) 1593 1594 connector_state_converter: DateTimeStreamStateConverter 1595 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1596 datetime_format=datetime_format, 1597 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1598 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1599 cursor_granularity=cursor_granularity, 1600 ) 1601 1602 # Create the cursor factory 1603 cursor_factory = ConcurrentCursorFactory( 1604 partial( 1605 self.create_concurrent_cursor_from_datetime_based_cursor, 1606 state_manager=state_manager, 1607 model_type=model_type, 1608 component_definition=component_definition, 1609 stream_name=stream_name, 1610 stream_namespace=stream_namespace, 1611 config=config, 1612 message_repository=NoopMessageRepository(), 1613 stream_state_migrations=stream_state_migrations, 1614 ) 1615 ) 1616 1617 
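# --- Illustrative sketch (not part of the factory implementation) ---
# Rough shape of the per-partition state that the resulting
# ConcurrentPerPartitionCursor reads and emits. The partition and cursor
# values below are hypothetical, and the exact keys may differ by CDK version.
#
#     example_stream_state = {
#         "use_global_cursor": False,
#         "states": [
#             {
#                 "partition": {"parent_id": "123"},
#                 "cursor": {"updated_at": "2024-01-01T00:00:00Z"},
#             }
#         ],
#         "state": {"updated_at": "2024-01-01T00:00:00Z"},  # global fallback cursor
#     }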
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1618 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1619 use_global_cursor = isinstance( 1620 partition_router, GroupingPartitionRouter 1621 ) or component_definition.get("global_substream_cursor", False) 1622 1623 # Return the concurrent cursor and state converter 1624 return ConcurrentPerPartitionCursor( 1625 cursor_factory=cursor_factory, 1626 partition_router=partition_router, 1627 stream_name=stream_name, 1628 stream_namespace=stream_namespace, 1629 stream_state=stream_state, 1630 message_repository=self._message_repository, # type: ignore 1631 connector_state_manager=state_manager, 1632 connector_state_converter=connector_state_converter, 1633 cursor_field=cursor_field, 1634 use_global_cursor=use_global_cursor, 1635 ) 1636 1637 @staticmethod 1638 def create_constant_backoff_strategy( 1639 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1640 ) -> ConstantBackoffStrategy: 1641 return ConstantBackoffStrategy( 1642 backoff_time_in_seconds=model.backoff_time_in_seconds, 1643 config=config, 1644 parameters=model.parameters or {}, 1645 ) 1646 1647 def create_cursor_pagination( 1648 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1649 ) -> CursorPaginationStrategy: 1650 if isinstance(decoder, PaginationDecoderDecorator): 1651 inner_decoder = decoder.decoder 1652 else: 1653 inner_decoder = decoder 1654 decoder = PaginationDecoderDecorator(decoder=decoder) 1655 1656 if self._is_supported_decoder_for_pagination(inner_decoder): 1657 decoder_to_use = decoder 1658 else: 1659 raise ValueError( 1660 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1661 ) 1662 1663 return CursorPaginationStrategy( 1664 cursor_value=model.cursor_value, 1665 decoder=decoder_to_use, 1666 page_size=model.page_size, 1667 stop_condition=model.stop_condition, 1668 config=config, 1669 parameters=model.parameters or {}, 1670 ) 1671 1672 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1673 """ 1674 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1675 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1676 :param model: The Pydantic model of the custom component being created 1677 :param config: The custom defined connector config 1678 :return: The declarative component built from the Pydantic model to be used at runtime 1679 """ 1680 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1681 component_fields = get_type_hints(custom_component_class) 1682 model_args = model.dict() 1683 model_args["config"] = config 1684 1685 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1686 # we defer to these arguments over the component's definition 1687 for key, arg in kwargs.items(): 1688 model_args[key] = arg 1689 1690 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1691 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1692 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1693 for model_field, model_value in model_args.items(): 1694 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1695 if ( 1696 isinstance(model_value, dict) 1697 and "type" not in model_value 1698 and model_field in component_fields 1699 ): 1700 derived_type = self._derive_component_type_from_type_hints( 1701 component_fields.get(model_field) 1702 ) 1703 if derived_type: 1704 model_value["type"] = derived_type 1705 1706 if self._is_component(model_value): 1707 model_args[model_field] = self._create_nested_component( 1708 model, model_field, model_value, config 1709 ) 1710 elif isinstance(model_value, list): 1711 vals = [] 1712 for v in model_value: 1713 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1714 derived_type = self._derive_component_type_from_type_hints( 1715 component_fields.get(model_field) 1716 ) 1717 if derived_type: 1718 v["type"] = derived_type 1719 if self._is_component(v): 1720 vals.append(self._create_nested_component(model, model_field, v, config)) 1721 else: 1722 vals.append(v) 1723 model_args[model_field] = vals 1724 1725 kwargs = { 1726 class_field: model_args[class_field] 1727 for class_field in component_fields.keys() 1728 if class_field in model_args 1729 } 1730 return custom_component_class(**kwargs) 1731 1732 @staticmethod 1733 def _get_class_from_fully_qualified_class_name( 1734 full_qualified_class_name: str, 1735 ) -> Any: 1736 """Get a class from its fully qualified name. 1737 1738 If a custom components module is needed, we assume it is already registered - probably 1739 as `source_declarative_manifest.components` or `components`. 1740 1741 Args: 1742 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1743 1744 Returns: 1745 Any: The class object. 1746 1747 Raises: 1748 ValueError: If the class cannot be loaded. 1749 """ 1750 split = full_qualified_class_name.split(".") 1751 module_name_full = ".".join(split[:-1]) 1752 class_name = split[-1] 1753 1754 try: 1755 module_ref = importlib.import_module(module_name_full) 1756 except ModuleNotFoundError as e: 1757 if split[0] == "source_declarative_manifest": 1758 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it at runtime using sys.path.append 1759 try: 1760 import os 1761 1762 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1763 module_ref = importlib.import_module( 1764 module_name_with_source_declarative_manifest 1765 ) 1766 except ModuleNotFoundError: 1767 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1768 else: 1769 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1770 1771 try: 1772 return getattr(module_ref, class_name) 1773 except AttributeError as e: 1774 raise ValueError( 1775 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1776 ) from e 1777 1778 @staticmethod 1779 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1780 interface = field_type 1781 while True: 1782 origin = get_origin(interface) 1783 if origin: 1784 # Unnest types until we reach the raw type 1785 # List[T] -> T 1786 # Optional[List[T]] -> T 1787 args = get_args(interface) 1788 interface = args[0] 1789 else: 1790 break 1791 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1792 return interface.__name__ 1793 return None 1794 1795 @staticmethod 1796 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1797 if not cls: 1798 return False 1799 return cls.__module__ == "builtins" 1800 1801 @staticmethod 1802 def _extract_missing_parameters(error: TypeError) -> List[str]: 1803 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1804 if parameter_search: 1805 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1806 else: 1807 return [] 1808 1809 def _create_nested_component( 1810 self, model: Any, model_field: str, model_value: Any, config: Config 1811 ) -> Any: 1812 type_name = model_value.get("type", None) 1813 if not type_name: 1814 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1815 return model_value 1816 1817 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1818 if model_type: 1819 parsed_model = model_type.parse_obj(model_value) 1820 try: 1821 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1822 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1823 # components and passing them to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1824 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1825 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1826 # are needed by a component and could not be shared.
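# --- Illustrative sketch (not part of the factory implementation) ---
# Example of what the comment above describes: a DefaultPaginator nested inside
# a custom retriever cannot receive `url_base` from its parent, so the manifest
# author supplies it through `$parameters`. The concrete values below are
# hypothetical; the `url_base` key is matched against the keyword-only
# arguments of create_default_paginator.
#
#     nested_component_definition = {
#         "type": "DefaultPaginator",
#         "pagination_strategy": {"type": "PageIncrement", "page_size": 100},
#         "$parameters": {
#             "url_base": "https://api.example.com",
#         },
#     }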
1827 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1828 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1829 model_parameters = model_value.get("$parameters", {}) 1830 matching_parameters = { 1831 kwarg: model_parameters[kwarg] 1832 for kwarg in constructor_kwargs 1833 if kwarg in model_parameters 1834 } 1835 return self._create_component_from_model( 1836 model=parsed_model, config=config, **matching_parameters 1837 ) 1838 except TypeError as error: 1839 missing_parameters = self._extract_missing_parameters(error) 1840 if missing_parameters: 1841 raise ValueError( 1842 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1843 + ", ".join( 1844 ( 1845 f"{type_name}.$parameters.{parameter}" 1846 for parameter in missing_parameters 1847 ) 1848 ) 1849 ) 1850 raise TypeError( 1851 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1852 ) 1853 else: 1854 raise ValueError( 1855 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1856 ) 1857 1858 @staticmethod 1859 def _is_component(model_value: Any) -> bool: 1860 return isinstance(model_value, dict) and model_value.get("type") is not None 1861 1862 def create_datetime_based_cursor( 1863 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1864 ) -> DatetimeBasedCursor: 1865 start_datetime: Union[str, MinMaxDatetime] = ( 1866 model.start_datetime 1867 if isinstance(model.start_datetime, str) 1868 else self.create_min_max_datetime(model.start_datetime, config) 1869 ) 1870 end_datetime: Union[str, MinMaxDatetime, None] = None 1871 if model.is_data_feed and model.end_datetime: 1872 raise ValueError("Data feed does not support end_datetime") 1873 if model.is_data_feed and model.is_client_side_incremental: 1874 raise ValueError( 1875 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1876 ) 1877 if model.end_datetime: 1878 end_datetime = ( 1879 model.end_datetime 1880 if isinstance(model.end_datetime, str) 1881 else self.create_min_max_datetime(model.end_datetime, config) 1882 ) 1883 1884 end_time_option = ( 1885 self._create_component_from_model( 1886 model.end_time_option, config, parameters=model.parameters or {} 1887 ) 1888 if model.end_time_option 1889 else None 1890 ) 1891 start_time_option = ( 1892 self._create_component_from_model( 1893 model.start_time_option, config, parameters=model.parameters or {} 1894 ) 1895 if model.start_time_option 1896 else None 1897 ) 1898 1899 return DatetimeBasedCursor( 1900 cursor_field=model.cursor_field, 1901 cursor_datetime_formats=model.cursor_datetime_formats 1902 if model.cursor_datetime_formats 1903 else [], 1904 cursor_granularity=model.cursor_granularity, 1905 datetime_format=model.datetime_format, 1906 end_datetime=end_datetime, 1907 start_datetime=start_datetime, 1908 step=model.step, 1909 end_time_option=end_time_option, 1910 lookback_window=model.lookback_window, 1911 start_time_option=start_time_option, 1912 partition_field_end=model.partition_field_end, 1913 partition_field_start=model.partition_field_start, 1914 message_repository=self._message_repository, 1915 is_compare_strictly=model.is_compare_strictly, 1916 config=config, 1917 parameters=model.parameters or {}, 1918 ) 1919 1920 def create_declarative_stream( 1921 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1922 ) -> DeclarativeStream: 1923 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1924 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1925 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1926 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1927 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1928 1929 primary_key = model.primary_key.__root__ if model.primary_key else None 1930 stop_condition_on_cursor = ( 1931 model.incremental_sync 1932 and hasattr(model.incremental_sync, "is_data_feed") 1933 and model.incremental_sync.is_data_feed 1934 ) 1935 client_side_incremental_sync = None 1936 if ( 1937 model.incremental_sync 1938 and hasattr(model.incremental_sync, "is_client_side_incremental") 1939 and model.incremental_sync.is_client_side_incremental 1940 ): 1941 supported_slicers = ( 1942 DatetimeBasedCursor, 1943 GlobalSubstreamCursor, 1944 PerPartitionWithGlobalCursor, 1945 ) 1946 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1947 raise ValueError( 1948 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1949 ) 1950 cursor = ( 1951 combined_slicers 1952 if isinstance( 1953 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1954 ) 1955 else self._create_component_from_model(model=model.incremental_sync, config=config) 1956 ) 1957 1958 client_side_incremental_sync = {"cursor": cursor} 1959 1960 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1961 cursor_model = model.incremental_sync 1962 1963 end_time_option = ( 1964 self._create_component_from_model( 1965 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1966 ) 1967 if cursor_model.end_time_option 1968 else None 1969 ) 1970 start_time_option = ( 1971 self._create_component_from_model( 1972 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1973 ) 1974 if cursor_model.start_time_option 1975 else None 1976 ) 1977 1978 request_options_provider = DatetimeBasedRequestOptionsProvider( 1979 start_time_option=start_time_option, 1980 end_time_option=end_time_option, 1981 partition_field_start=cursor_model.partition_field_end, 1982 partition_field_end=cursor_model.partition_field_end, 1983 config=config, 1984 parameters=model.parameters or {}, 1985 ) 1986 elif model.incremental_sync and isinstance( 1987 model.incremental_sync, IncrementingCountCursorModel 1988 ): 1989 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1990 1991 start_time_option = ( 1992 self._create_component_from_model( 1993 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1994 config, 1995 parameters=cursor_model.parameters or {}, 1996 ) 1997 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1998 else None 1999 ) 2000 2001 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2002 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2003 partition_field_start = "start" 2004 2005 request_options_provider = DatetimeBasedRequestOptionsProvider( 2006 start_time_option=start_time_option, 2007 partition_field_start=partition_field_start, 2008 config=config, 2009 parameters=model.parameters or {}, 2010 ) 2011 else: 2012 request_options_provider = None 2013 2014 transformations = [] 2015 if model.transformations: 2016 for transformation_model in model.transformations: 2017 transformations.append( 2018 self._create_component_from_model(model=transformation_model, config=config) 2019 ) 2020 file_uploader = None 2021 if model.file_uploader: 2022 file_uploader = self._create_component_from_model( 2023 model=model.file_uploader, config=config 2024 ) 2025 2026 retriever = self._create_component_from_model( 2027 model=model.retriever, 2028 config=config, 2029 name=model.name, 2030 primary_key=primary_key, 2031 stream_slicer=combined_slicers, 2032 request_options_provider=request_options_provider, 2033 stop_condition_on_cursor=stop_condition_on_cursor, 2034 client_side_incremental_sync=client_side_incremental_sync, 2035 transformations=transformations, 2036 file_uploader=file_uploader, 2037 incremental_sync=model.incremental_sync, 2038 ) 2039 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2040 2041 if model.state_migrations: 2042 state_transformations = [ 2043 self._create_component_from_model(state_migration, config, declarative_stream=model) 2044 for 
state_migration in model.state_migrations 2045 ] 2046 else: 2047 state_transformations = [] 2048 2049 schema_loader: Union[ 2050 CompositeSchemaLoader, 2051 DefaultSchemaLoader, 2052 DynamicSchemaLoader, 2053 InlineSchemaLoader, 2054 JsonFileSchemaLoader, 2055 ] 2056 if model.schema_loader and isinstance(model.schema_loader, list): 2057 nested_schema_loaders = [ 2058 self._create_component_from_model(model=nested_schema_loader, config=config) 2059 for nested_schema_loader in model.schema_loader 2060 ] 2061 schema_loader = CompositeSchemaLoader( 2062 schema_loaders=nested_schema_loaders, parameters={} 2063 ) 2064 elif model.schema_loader: 2065 schema_loader = self._create_component_from_model( 2066 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2067 config=config, 2068 ) 2069 else: 2070 options = model.parameters or {} 2071 if "name" not in options: 2072 options["name"] = model.name 2073 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2074 2075 return DeclarativeStream( 2076 name=model.name or "", 2077 primary_key=primary_key, 2078 retriever=retriever, 2079 schema_loader=schema_loader, 2080 stream_cursor_field=cursor_field or "", 2081 state_migrations=state_transformations, 2082 config=config, 2083 parameters=model.parameters or {}, 2084 ) 2085 2086 def _build_stream_slicer_from_partition_router( 2087 self, 2088 model: Union[ 2089 AsyncRetrieverModel, 2090 CustomRetrieverModel, 2091 SimpleRetrieverModel, 2092 ], 2093 config: Config, 2094 stream_name: Optional[str] = None, 2095 ) -> Optional[PartitionRouter]: 2096 if ( 2097 hasattr(model, "partition_router") 2098 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2099 and model.partition_router 2100 ): 2101 stream_slicer_model = model.partition_router 2102 if isinstance(stream_slicer_model, list): 2103 return CartesianProductStreamSlicer( 2104 [ 2105 self._create_component_from_model( 2106 model=slicer, config=config, stream_name=stream_name or "" 2107 ) 2108 for slicer in stream_slicer_model 2109 ], 2110 parameters={}, 2111 ) 2112 else: 2113 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2114 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2115 ) 2116 return None 2117 2118 def _build_incremental_cursor( 2119 self, 2120 model: DeclarativeStreamModel, 2121 stream_slicer: Optional[PartitionRouter], 2122 config: Config, 2123 ) -> Optional[StreamSlicer]: 2124 if model.incremental_sync and stream_slicer: 2125 if model.retriever.type == "AsyncRetriever": 2126 stream_name = model.name or "" 2127 stream_namespace = None 2128 stream_state = self._connector_state_manager.get_stream_state( 2129 stream_name, stream_namespace 2130 ) 2131 state_transformations = ( 2132 [ 2133 self._create_component_from_model( 2134 state_migration, config, declarative_stream=model 2135 ) 2136 for state_migration in model.state_migrations 2137 ] 2138 if model.state_migrations 2139 else [] 2140 ) 2141 2142 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2143 state_manager=self._connector_state_manager, 2144 model_type=DatetimeBasedCursorModel, 2145 component_definition=model.incremental_sync.__dict__, 2146 stream_name=stream_name, 2147 stream_namespace=stream_namespace, 2148 config=config or {}, 2149 stream_state=stream_state, 2150 stream_state_migrations=state_transformations, 2151 partition_router=stream_slicer, 2152 ) 2153 2154 incremental_sync_model = model.incremental_sync 2155 cursor_component = self._create_component_from_model( 2156 model=incremental_sync_model, config=config 2157 ) 2158 is_global_cursor = ( 2159 hasattr(incremental_sync_model, "global_substream_cursor") 2160 and incremental_sync_model.global_substream_cursor 2161 ) 2162 2163 if is_global_cursor: 2164 return GlobalSubstreamCursor( 2165 stream_cursor=cursor_component, partition_router=stream_slicer 2166 ) 2167 return PerPartitionWithGlobalCursor( 2168 cursor_factory=CursorFactory( 2169 lambda: self._create_component_from_model( 2170 model=incremental_sync_model, config=config 2171 ), 2172 ), 2173 partition_router=stream_slicer, 2174 stream_cursor=cursor_component, 2175 ) 2176 elif model.incremental_sync: 2177 if model.retriever.type == "AsyncRetriever": 2178 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2179 model_type=DatetimeBasedCursorModel, 2180 component_definition=model.incremental_sync.__dict__, 2181 stream_name=model.name or "", 2182 stream_namespace=None, 2183 config=config or {}, 2184 stream_state_migrations=model.state_migrations, 2185 ) 2186 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2187 return None 2188 2189 def _build_resumable_cursor( 2190 self, 2191 model: Union[ 2192 AsyncRetrieverModel, 2193 CustomRetrieverModel, 2194 SimpleRetrieverModel, 2195 ], 2196 stream_slicer: Optional[PartitionRouter], 2197 ) -> Optional[StreamSlicer]: 2198 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2199 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2200 return ResumableFullRefreshCursor(parameters={}) 2201 elif stream_slicer: 2202 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2203 return PerPartitionCursor( 2204 cursor_factory=CursorFactory( 2205 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2206 ), 2207 partition_router=stream_slicer, 2208 ) 2209 return None 2210 2211 def _merge_stream_slicers( 2212 self, model: DeclarativeStreamModel, config: Config 2213 ) -> Optional[StreamSlicer]: 2214 retriever_model = model.retriever 2215 2216 stream_slicer = self._build_stream_slicer_from_partition_router( 2217 retriever_model, config, stream_name=model.name 2218 ) 2219 2220 if retriever_model.type == "AsyncRetriever": 2221 is_not_datetime_cursor = ( 2222 model.incremental_sync.type != "DatetimeBasedCursor" 2223 if model.incremental_sync 2224 else None 2225 ) 2226 is_partition_router = ( 2227 
bool(retriever_model.partition_router) if model.incremental_sync else None 2228 ) 2229 2230 if is_not_datetime_cursor: 2231 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2232 # support or unordered slices (for example, when we trigger reports for January and February, the report 2233 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2234 # implementation available in the CDK, we can enable more cursors here. 2235 raise ValueError( 2236 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2237 ) 2238 2239 if is_partition_router and not stream_slicer: 2240 # Note that this development is also done in parallel to the per partition development which once merged 2241 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2242 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2243 2244 if model.incremental_sync: 2245 return self._build_incremental_cursor(model, stream_slicer, config) 2246 2247 return ( 2248 stream_slicer 2249 if self._disable_resumable_full_refresh 2250 else self._build_resumable_cursor(retriever_model, stream_slicer) 2251 ) 2252 2253 def create_default_error_handler( 2254 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2255 ) -> DefaultErrorHandler: 2256 backoff_strategies = [] 2257 if model.backoff_strategies: 2258 for backoff_strategy_model in model.backoff_strategies: 2259 backoff_strategies.append( 2260 self._create_component_from_model(model=backoff_strategy_model, config=config) 2261 ) 2262 2263 response_filters = [] 2264 if model.response_filters: 2265 for response_filter_model in model.response_filters: 2266 response_filters.append( 2267 self._create_component_from_model(model=response_filter_model, config=config) 2268 ) 2269 response_filters.append( 2270 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2271 ) 2272 2273 return DefaultErrorHandler( 2274 backoff_strategies=backoff_strategies, 2275 max_retries=model.max_retries, 2276 response_filters=response_filters, 2277 config=config, 2278 parameters=model.parameters or {}, 2279 ) 2280 2281 def create_default_paginator( 2282 self, 2283 model: DefaultPaginatorModel, 2284 config: Config, 2285 *, 2286 url_base: str, 2287 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2288 decoder: Optional[Decoder] = None, 2289 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2290 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2291 if decoder: 2292 if self._is_supported_decoder_for_pagination(decoder): 2293 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2294 else: 2295 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2296 else: 2297 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2298 page_size_option = ( 2299 self._create_component_from_model(model=model.page_size_option, config=config) 2300 if model.page_size_option 2301 else None 2302 ) 2303 page_token_option = ( 2304 self._create_component_from_model(model=model.page_token_option, config=config) 2305 if model.page_token_option 2306 else None 2307 ) 2308 pagination_strategy = self._create_component_from_model( 2309 model=model.pagination_strategy, 2310 config=config, 2311 decoder=decoder_to_use, 2312 extractor_model=extractor_model, 2313 ) 2314 if cursor_used_for_stop_condition: 2315 
pagination_strategy = StopConditionPaginationStrategyDecorator( 2316 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2317 ) 2318 paginator = DefaultPaginator( 2319 decoder=decoder_to_use, 2320 page_size_option=page_size_option, 2321 page_token_option=page_token_option, 2322 pagination_strategy=pagination_strategy, 2323 url_base=url_base, 2324 config=config, 2325 parameters=model.parameters or {}, 2326 ) 2327 if self._limit_pages_fetched_per_slice: 2328 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2329 return paginator 2330 2331 def create_dpath_extractor( 2332 self, 2333 model: DpathExtractorModel, 2334 config: Config, 2335 decoder: Optional[Decoder] = None, 2336 **kwargs: Any, 2337 ) -> DpathExtractor: 2338 if decoder: 2339 decoder_to_use = decoder 2340 else: 2341 decoder_to_use = JsonDecoder(parameters={}) 2342 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2343 return DpathExtractor( 2344 decoder=decoder_to_use, 2345 field_path=model_field_path, 2346 config=config, 2347 parameters=model.parameters or {}, 2348 ) 2349 2350 @staticmethod 2351 def create_response_to_file_extractor( 2352 model: ResponseToFileExtractorModel, 2353 **kwargs: Any, 2354 ) -> ResponseToFileExtractor: 2355 return ResponseToFileExtractor(parameters=model.parameters or {}) 2356 2357 @staticmethod 2358 def create_exponential_backoff_strategy( 2359 model: ExponentialBackoffStrategyModel, config: Config 2360 ) -> ExponentialBackoffStrategy: 2361 return ExponentialBackoffStrategy( 2362 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2363 ) 2364 2365 @staticmethod 2366 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2367 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2368 2369 def create_http_requester( 2370 self, 2371 model: HttpRequesterModel, 2372 config: Config, 2373 decoder: Decoder = JsonDecoder(parameters={}), 2374 query_properties_key: Optional[str] = None, 2375 use_cache: Optional[bool] = None, 2376 *, 2377 name: str, 2378 ) -> HttpRequester: 2379 authenticator = ( 2380 self._create_component_from_model( 2381 model=model.authenticator, 2382 config=config, 2383 url_base=model.url or model.url_base, 2384 name=name, 2385 decoder=decoder, 2386 ) 2387 if model.authenticator 2388 else None 2389 ) 2390 error_handler = ( 2391 self._create_component_from_model(model=model.error_handler, config=config) 2392 if model.error_handler 2393 else DefaultErrorHandler( 2394 backoff_strategies=[], 2395 response_filters=[], 2396 config=config, 2397 parameters=model.parameters or {}, 2398 ) 2399 ) 2400 2401 api_budget = self._api_budget 2402 2403 # Removes QueryProperties components from the interpolated mappings because it has been designed 2404 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2405 # instead of through jinja interpolation 2406 request_parameters: Optional[Union[str, Mapping[str, str]]] 2407 if isinstance(model.request_parameters, Mapping): 2408 request_parameters = self._remove_query_properties(model.request_parameters) 2409 else: 2410 request_parameters = model.request_parameters 2411 2412 request_options_provider = InterpolatedRequestOptionsProvider( 2413 request_body=model.request_body, 2414 request_body_data=model.request_body_data, 2415 request_body_json=model.request_body_json, 2416 request_headers=model.request_headers, 2417 
request_parameters=request_parameters, 2418 query_properties_key=query_properties_key, 2419 config=config, 2420 parameters=model.parameters or {}, 2421 ) 2422 2423 assert model.use_cache is not None # for mypy 2424 assert model.http_method is not None # for mypy 2425 2426 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2427 2428 return HttpRequester( 2429 name=name, 2430 url=model.url, 2431 url_base=model.url_base, 2432 path=model.path, 2433 authenticator=authenticator, 2434 error_handler=error_handler, 2435 api_budget=api_budget, 2436 http_method=HttpMethod[model.http_method.value], 2437 request_options_provider=request_options_provider, 2438 config=config, 2439 disable_retries=self._disable_retries, 2440 parameters=model.parameters or {}, 2441 message_repository=self._message_repository, 2442 use_cache=should_use_cache, 2443 decoder=decoder, 2444 stream_response=decoder.is_stream_response() if decoder else False, 2445 ) 2446 2447 @staticmethod 2448 def create_http_response_filter( 2449 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2450 ) -> HttpResponseFilter: 2451 if model.action: 2452 action = ResponseAction(model.action.value) 2453 else: 2454 action = None 2455 2456 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2457 2458 http_codes = ( 2459 set(model.http_codes) if model.http_codes else set() 2460 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2461 2462 return HttpResponseFilter( 2463 action=action, 2464 failure_type=failure_type, 2465 error_message=model.error_message or "", 2466 error_message_contains=model.error_message_contains or "", 2467 http_codes=http_codes, 2468 predicate=model.predicate or "", 2469 config=config, 2470 parameters=model.parameters or {}, 2471 ) 2472 2473 @staticmethod 2474 def create_inline_schema_loader( 2475 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2476 ) -> InlineSchemaLoader: 2477 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2478 2479 def create_complex_field_type( 2480 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2481 ) -> ComplexFieldType: 2482 items = ( 2483 self._create_component_from_model(model=model.items, config=config) 2484 if isinstance(model.items, ComplexFieldTypeModel) 2485 else model.items 2486 ) 2487 2488 return ComplexFieldType(field_type=model.field_type, items=items) 2489 2490 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2491 target_type = ( 2492 self._create_component_from_model(model=model.target_type, config=config) 2493 if isinstance(model.target_type, ComplexFieldTypeModel) 2494 else model.target_type 2495 ) 2496 2497 return TypesMap( 2498 target_type=target_type, 2499 current_type=model.current_type, 2500 condition=model.condition if model.condition is not None else "True", 2501 ) 2502 2503 def create_schema_type_identifier( 2504 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2505 ) -> SchemaTypeIdentifier: 2506 types_mapping = [] 2507 if model.types_mapping: 2508 types_mapping.extend( 2509 [ 2510 self._create_component_from_model(types_map, config=config) 2511 for types_map in model.types_mapping 2512 ] 2513 ) 2514 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2515 [x for x in model.schema_pointer] if model.schema_pointer else [] 2516 ) 2517 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2518 
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2519 [x for x in model.type_pointer] if model.type_pointer else None 2520 ) 2521 2522 return SchemaTypeIdentifier( 2523 schema_pointer=model_schema_pointer, 2524 key_pointer=model_key_pointer, 2525 type_pointer=model_type_pointer, 2526 types_mapping=types_mapping, 2527 parameters=model.parameters or {}, 2528 ) 2529 2530 def create_dynamic_schema_loader( 2531 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2532 ) -> DynamicSchemaLoader: 2533 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2534 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2535 2536 schema_transformations = [] 2537 if model.schema_transformations: 2538 for transformation_model in model.schema_transformations: 2539 schema_transformations.append( 2540 self._create_component_from_model(model=transformation_model, config=config) 2541 ) 2542 name = "dynamic_properties" 2543 retriever = self._create_component_from_model( 2544 model=model.retriever, 2545 config=config, 2546 name=name, 2547 primary_key=None, 2548 stream_slicer=combined_slicers, 2549 transformations=[], 2550 use_cache=True, 2551 log_formatter=( 2552 lambda response: format_http_message( 2553 response, 2554 f"Schema loader '{name}' request", 2555 f"Request performed in order to extract schema.", 2556 name, 2557 is_auxiliary=True, 2558 ) 2559 ), 2560 ) 2561 schema_type_identifier = self._create_component_from_model( 2562 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2563 ) 2564 schema_filter = ( 2565 self._create_component_from_model( 2566 model.schema_filter, config=config, parameters=model.parameters or {} 2567 ) 2568 if model.schema_filter is not None 2569 else None 2570 ) 2571 2572 return DynamicSchemaLoader( 2573 retriever=retriever, 2574 config=config, 2575 schema_transformations=schema_transformations, 2576 schema_filter=schema_filter, 2577 schema_type_identifier=schema_type_identifier, 2578 parameters=model.parameters or {}, 2579 ) 2580 2581 @staticmethod 2582 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2583 return JsonDecoder(parameters={}) 2584 2585 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2586 return CompositeRawDecoder( 2587 parser=ModelToComponentFactory._get_parser(model, config), 2588 stream_response=False if self._emit_connector_builder_messages else True, 2589 ) 2590 2591 def create_jsonl_decoder( 2592 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2593 ) -> Decoder: 2594 return CompositeRawDecoder( 2595 parser=ModelToComponentFactory._get_parser(model, config), 2596 stream_response=False if self._emit_connector_builder_messages else True, 2597 ) 2598 2599 def create_gzip_decoder( 2600 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2601 ) -> Decoder: 2602 _compressed_response_types = { 2603 "gzip", 2604 "x-gzip", 2605 "gzip, deflate", 2606 "x-gzip, deflate", 2607 "application/zip", 2608 "application/gzip", 2609 "application/x-gzip", 2610 "application/x-zip-compressed", 2611 } 2612 2613 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2614 2615 if self._emit_connector_builder_messages: 2616 # This is very surprising but if the response is not streamed, 2617 # CompositeRawDecoder calls response.content and the requests library 
actually uncompresses the data as opposed to response.raw, 2618 # which uses urllib3 directly and does not uncompress the data. 2619 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2620 2621 return CompositeRawDecoder.by_headers( 2622 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2623 stream_response=True, 2624 fallback_parser=gzip_parser.inner_parser, 2625 ) 2626 2627 @staticmethod 2628 def create_incrementing_count_cursor( 2629 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2630 ) -> DatetimeBasedCursor: 2631 # This should not actually get used anywhere at runtime, but we needed to add this to pass checks since 2632 # we still parse models into components. The issue is that there's no runtime implementation of an 2633 # IncrementingCountCursor. 2634 # A known and expected issue with this stub is running a check with a declared IncrementingCountCursor because it is run without a ConcurrentCursor. 2635 return DatetimeBasedCursor( 2636 cursor_field=model.cursor_field, 2637 datetime_format="%Y-%m-%d", 2638 start_datetime="2024-12-12", 2639 config=config, 2640 parameters={}, 2641 ) 2642 2643 @staticmethod 2644 def create_iterable_decoder( 2645 model: IterableDecoderModel, config: Config, **kwargs: Any 2646 ) -> IterableDecoder: 2647 return IterableDecoder(parameters={}) 2648 2649 @staticmethod 2650 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2651 return XmlDecoder(parameters={}) 2652 2653 def create_zipfile_decoder( 2654 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2655 ) -> ZipfileDecoder: 2656 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2657 2658 @staticmethod 2659 def _get_parser(model: BaseModel, config: Config) -> Parser: 2660 if isinstance(model, JsonDecoderModel): 2661 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2662 return JsonParser() 2663 elif isinstance(model, JsonlDecoderModel): 2664 return JsonLineParser() 2665 elif isinstance(model, CsvDecoderModel): 2666 return CsvParser( 2667 encoding=model.encoding, 2668 delimiter=model.delimiter, 2669 set_values_to_none=model.set_values_to_none, 2670 ) 2671 elif isinstance(model, GzipDecoderModel): 2672 return GzipParser( 2673 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2674 ) 2675 elif isinstance( 2676 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2677 ): 2678 raise ValueError(f"Decoder type {model} does not have a parser associated with it") 2679 2680 raise ValueError(f"Unknown decoder type {model}") 2681 2682 @staticmethod 2683 def create_json_file_schema_loader( 2684 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2685 ) -> JsonFileSchemaLoader: 2686 return JsonFileSchemaLoader( 2687 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2688 ) 2689 2690 @staticmethod 2691 def create_jwt_authenticator( 2692 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2693 ) -> JwtAuthenticator: 2694 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2695 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2696 return JwtAuthenticator( 2697 config=config, 2698 parameters=model.parameters or {}, 2699 algorithm=JwtAlgorithm(model.algorithm.value), 2700 secret_key=model.secret_key, 2701
base64_encode_secret_key=model.base64_encode_secret_key, 2702 token_duration=model.token_duration, 2703 header_prefix=model.header_prefix, 2704 kid=jwt_headers.kid, 2705 typ=jwt_headers.typ, 2706 cty=jwt_headers.cty, 2707 iss=jwt_payload.iss, 2708 sub=jwt_payload.sub, 2709 aud=jwt_payload.aud, 2710 additional_jwt_headers=model.additional_jwt_headers, 2711 additional_jwt_payload=model.additional_jwt_payload, 2712 ) 2713 2714 def create_list_partition_router( 2715 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2716 ) -> ListPartitionRouter: 2717 request_option = ( 2718 self._create_component_from_model(model.request_option, config) 2719 if model.request_option 2720 else None 2721 ) 2722 return ListPartitionRouter( 2723 cursor_field=model.cursor_field, 2724 request_option=request_option, 2725 values=model.values, 2726 config=config, 2727 parameters=model.parameters or {}, 2728 ) 2729 2730 @staticmethod 2731 def create_min_max_datetime( 2732 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2733 ) -> MinMaxDatetime: 2734 return MinMaxDatetime( 2735 datetime=model.datetime, 2736 datetime_format=model.datetime_format or "", 2737 max_datetime=model.max_datetime or "", 2738 min_datetime=model.min_datetime or "", 2739 parameters=model.parameters or {}, 2740 ) 2741 2742 @staticmethod 2743 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2744 return NoAuth(parameters=model.parameters or {}) 2745 2746 @staticmethod 2747 def create_no_pagination( 2748 model: NoPaginationModel, config: Config, **kwargs: Any 2749 ) -> NoPagination: 2750 return NoPagination(parameters={}) 2751 2752 def create_oauth_authenticator( 2753 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2754 ) -> DeclarativeOauth2Authenticator: 2755 profile_assertion = ( 2756 self._create_component_from_model(model.profile_assertion, config=config) 2757 if model.profile_assertion 2758 else None 2759 ) 2760 2761 if model.refresh_token_updater: 2762 # ignore type error because fixing it would have a lot of dependencies, revisit later 2763 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2764 config, 2765 InterpolatedString.create( 2766 model.token_refresh_endpoint, # type: ignore 2767 parameters=model.parameters or {}, 2768 ).eval(config), 2769 access_token_name=InterpolatedString.create( 2770 model.access_token_name or "access_token", parameters=model.parameters or {} 2771 ).eval(config), 2772 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2773 expires_in_name=InterpolatedString.create( 2774 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2775 ).eval(config), 2776 client_id_name=InterpolatedString.create( 2777 model.client_id_name or "client_id", parameters=model.parameters or {} 2778 ).eval(config), 2779 client_id=InterpolatedString.create( 2780 model.client_id, parameters=model.parameters or {} 2781 ).eval(config) 2782 if model.client_id 2783 else model.client_id, 2784 client_secret_name=InterpolatedString.create( 2785 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2786 ).eval(config), 2787 client_secret=InterpolatedString.create( 2788 model.client_secret, parameters=model.parameters or {} 2789 ).eval(config) 2790 if model.client_secret 2791 else model.client_secret, 2792 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2793 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2794 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2795 grant_type_name=InterpolatedString.create( 2796 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2797 ).eval(config), 2798 grant_type=InterpolatedString.create( 2799 model.grant_type or "refresh_token", parameters=model.parameters or {} 2800 ).eval(config), 2801 refresh_request_body=InterpolatedMapping( 2802 model.refresh_request_body or {}, parameters=model.parameters or {} 2803 ).eval(config), 2804 refresh_request_headers=InterpolatedMapping( 2805 model.refresh_request_headers or {}, parameters=model.parameters or {} 2806 ).eval(config), 2807 scopes=model.scopes, 2808 token_expiry_date_format=model.token_expiry_date_format, 2809 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2810 message_repository=self._message_repository, 2811 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2812 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2813 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2814 ) 2815 # ignore type error because fixing it would have a lot of dependencies, revisit later 2816 return DeclarativeOauth2Authenticator( # type: ignore 2817 access_token_name=model.access_token_name or "access_token", 2818 access_token_value=model.access_token_value, 2819 client_id_name=model.client_id_name or "client_id", 2820 client_id=model.client_id, 2821 client_secret_name=model.client_secret_name or "client_secret", 2822 client_secret=model.client_secret, 2823 expires_in_name=model.expires_in_name or "expires_in", 2824 grant_type_name=model.grant_type_name or "grant_type", 2825 grant_type=model.grant_type or "refresh_token", 2826 refresh_request_body=model.refresh_request_body, 2827 refresh_request_headers=model.refresh_request_headers, 2828 refresh_token_name=model.refresh_token_name or "refresh_token", 2829 refresh_token=model.refresh_token, 2830 scopes=model.scopes, 2831 token_expiry_date=model.token_expiry_date, 2832 token_expiry_date_format=model.token_expiry_date_format, 2833 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2834 token_refresh_endpoint=model.token_refresh_endpoint, 2835 config=config, 2836 parameters=model.parameters or {}, 2837 message_repository=self._message_repository, 2838 profile_assertion=profile_assertion, 2839 use_profile_assertion=model.use_profile_assertion, 2840 ) 2841 2842 def create_offset_increment( 2843 self, 2844 model: OffsetIncrementModel, 2845 config: Config, 2846 decoder: Decoder, 2847 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2848 **kwargs: Any, 2849 ) -> OffsetIncrement: 2850 if isinstance(decoder, PaginationDecoderDecorator): 2851 inner_decoder = decoder.decoder 2852 else: 2853 inner_decoder = decoder 2854 decoder = PaginationDecoderDecorator(decoder=decoder) 2855 2856 if self._is_supported_decoder_for_pagination(inner_decoder): 2857 decoder_to_use = decoder 2858 else: 2859 raise ValueError( 2860 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2861 ) 2862 2863 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2864 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2865 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2866 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2867 # When we have more time to investigate we can look into reusing the same component. 2868 extractor = ( 2869 self._create_component_from_model( 2870 model=extractor_model, config=config, decoder=decoder_to_use 2871 ) 2872 if extractor_model 2873 else None 2874 ) 2875 2876 return OffsetIncrement( 2877 page_size=model.page_size, 2878 config=config, 2879 decoder=decoder_to_use, 2880 extractor=extractor, 2881 inject_on_first_request=model.inject_on_first_request or False, 2882 parameters=model.parameters or {}, 2883 ) 2884 2885 @staticmethod 2886 def create_page_increment( 2887 model: PageIncrementModel, config: Config, **kwargs: Any 2888 ) -> PageIncrement: 2889 return PageIncrement( 2890 page_size=model.page_size, 2891 config=config, 2892 start_from_page=model.start_from_page or 0, 2893 inject_on_first_request=model.inject_on_first_request or False, 2894 parameters=model.parameters or {}, 2895 ) 2896 2897 def create_parent_stream_config( 2898 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2899 ) -> ParentStreamConfig: 2900 declarative_stream = self._create_component_from_model( 2901 model.stream, config=config, **kwargs 2902 ) 2903 request_option = ( 2904 self._create_component_from_model(model.request_option, config=config) 2905 if model.request_option 2906 else None 2907 ) 2908 2909 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2910 raise ValueError( 2911 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2912 ) 2913 2914 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2915 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2916 ) 2917 2918 return ParentStreamConfig( 2919 parent_key=model.parent_key, 2920 request_option=request_option, 2921 stream=declarative_stream, 2922 partition_field=model.partition_field, 2923 config=config, 2924 incremental_dependency=model.incremental_dependency or False, 2925 parameters=model.parameters or {}, 2926 extra_fields=model.extra_fields, 2927 lazy_read_pointer=model_lazy_read_pointer, 2928 ) 2929 2930 def create_properties_from_endpoint( 2931 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2932 ) -> PropertiesFromEndpoint: 2933 retriever = self._create_component_from_model( 2934 model=model.retriever, 2935 config=config, 2936 name="dynamic_properties", 2937 primary_key=None, 2938 stream_slicer=None, 2939 transformations=[], 2940 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2941 ) 2942 return PropertiesFromEndpoint( 2943 property_field_path=model.property_field_path, 2944 retriever=retriever, 2945 config=config, 2946 parameters=model.parameters or {}, 2947 ) 2948 2949 def create_property_chunking( 2950 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2951 ) -> PropertyChunking: 2952 record_merge_strategy = ( 2953 self._create_component_from_model( 2954 model=model.record_merge_strategy, config=config, **kwargs 2955 ) 2956 if model.record_merge_strategy 2957 else None 2958 ) 2959 2960 property_limit_type: PropertyLimitType 2961 match model.property_limit_type: 2962 case PropertyLimitTypeModel.property_count: 2963 property_limit_type = PropertyLimitType.property_count 2964 case PropertyLimitTypeModel.characters: 2965 property_limit_type = PropertyLimitType.characters 2966 case _: 2967 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2968 2969 return PropertyChunking( 2970 property_limit_type=property_limit_type, 2971 property_limit=model.property_limit, 2972 record_merge_strategy=record_merge_strategy, 2973 config=config, 2974 parameters=model.parameters or {}, 2975 ) 2976 2977 def create_query_properties( 2978 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2979 ) -> QueryProperties: 2980 if isinstance(model.property_list, list): 2981 property_list = model.property_list 2982 else: 2983 property_list = self._create_component_from_model( 2984 model=model.property_list, config=config, **kwargs 2985 ) 2986 2987 property_chunking = ( 2988 self._create_component_from_model( 2989 model=model.property_chunking, config=config, **kwargs 2990 ) 2991 if model.property_chunking 2992 else None 2993 ) 2994 2995 return QueryProperties( 2996 property_list=property_list, 2997 always_include_properties=model.always_include_properties, 2998 property_chunking=property_chunking, 2999 config=config, 3000 parameters=model.parameters or {}, 3001 ) 3002 3003 @staticmethod 3004 def create_record_filter( 3005 model: RecordFilterModel, config: Config, **kwargs: Any 3006 ) -> RecordFilter: 3007 return RecordFilter( 3008 condition=model.condition or "", config=config, parameters=model.parameters or {} 3009 ) 3010 3011 @staticmethod 3012 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3013 return RequestPath(parameters={}) 3014 3015 @staticmethod 3016 def
create_request_option( 3017 model: RequestOptionModel, config: Config, **kwargs: Any 3018 ) -> RequestOption: 3019 inject_into = RequestOptionType(model.inject_into.value) 3020 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3021 [ 3022 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3023 for segment in model.field_path 3024 ] 3025 if model.field_path 3026 else None 3027 ) 3028 field_name = ( 3029 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3030 if model.field_name 3031 else None 3032 ) 3033 return RequestOption( 3034 field_name=field_name, 3035 field_path=field_path, 3036 inject_into=inject_into, 3037 parameters=kwargs.get("parameters", {}), 3038 ) 3039 3040 def create_record_selector( 3041 self, 3042 model: RecordSelectorModel, 3043 config: Config, 3044 *, 3045 name: str, 3046 transformations: List[RecordTransformation] | None = None, 3047 decoder: Decoder | None = None, 3048 client_side_incremental_sync: Dict[str, Any] | None = None, 3049 file_uploader: Optional[DefaultFileUploader] = None, 3050 **kwargs: Any, 3051 ) -> RecordSelector: 3052 extractor = self._create_component_from_model( 3053 model=model.extractor, decoder=decoder, config=config 3054 ) 3055 record_filter = ( 3056 self._create_component_from_model(model.record_filter, config=config) 3057 if model.record_filter 3058 else None 3059 ) 3060 3061 transform_before_filtering = ( 3062 False if model.transform_before_filtering is None else model.transform_before_filtering 3063 ) 3064 if client_side_incremental_sync: 3065 record_filter = ClientSideIncrementalRecordFilterDecorator( 3066 config=config, 3067 parameters=model.parameters, 3068 condition=model.record_filter.condition 3069 if (model.record_filter and hasattr(model.record_filter, "condition")) 3070 else None, 3071 **client_side_incremental_sync, 3072 ) 3073 transform_before_filtering = ( 3074 True 3075 if model.transform_before_filtering is None 3076 else model.transform_before_filtering 3077 ) 3078 3079 if model.schema_normalization is None: 3080 # default to no schema normalization if not set 3081 model.schema_normalization = SchemaNormalizationModel.None_ 3082 3083 schema_normalization = ( 3084 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3085 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3086 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3087 ) 3088 3089 return RecordSelector( 3090 extractor=extractor, 3091 name=name, 3092 config=config, 3093 record_filter=record_filter, 3094 transformations=transformations or [], 3095 file_uploader=file_uploader, 3096 schema_normalization=schema_normalization, 3097 parameters=model.parameters or {}, 3098 transform_before_filtering=transform_before_filtering, 3099 ) 3100 3101 @staticmethod 3102 def create_remove_fields( 3103 model: RemoveFieldsModel, config: Config, **kwargs: Any 3104 ) -> RemoveFields: 3105 return RemoveFields( 3106 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3107 ) 3108 3109 def create_selective_authenticator( 3110 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3111 ) -> DeclarativeAuthenticator: 3112 authenticators = { 3113 name: self._create_component_from_model(model=auth, config=config) 3114 for name, auth in model.authenticators.items() 3115 } 3116 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3117 return SelectiveAuthenticator( # type: ignore[abstract] 3118 config=config, 3119 authenticators=authenticators, 3120 authenticator_selection_path=model.authenticator_selection_path, 3121 **kwargs, 3122 ) 3123 3124 @staticmethod 3125 def create_legacy_session_token_authenticator( 3126 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3127 ) -> LegacySessionTokenAuthenticator: 3128 return LegacySessionTokenAuthenticator( 3129 api_url=url_base, 3130 header=model.header, 3131 login_url=model.login_url, 3132 password=model.password or "", 3133 session_token=model.session_token or "", 3134 session_token_response_key=model.session_token_response_key or "", 3135 username=model.username or "", 3136 validate_session_url=model.validate_session_url, 3137 config=config, 3138 parameters=model.parameters or {}, 3139 ) 3140 3141 def create_simple_retriever( 3142 self, 3143 model: SimpleRetrieverModel, 3144 config: Config, 3145 *, 3146 name: str, 3147 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3148 stream_slicer: Optional[StreamSlicer], 3149 request_options_provider: Optional[RequestOptionsProvider] = None, 3150 stop_condition_on_cursor: bool = False, 3151 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3152 transformations: List[RecordTransformation], 3153 file_uploader: Optional[DefaultFileUploader] = None, 3154 incremental_sync: Optional[ 3155 Union[ 3156 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3157 ] 3158 ] = None, 3159 use_cache: Optional[bool] = None, 3160 log_formatter: Optional[Callable[[Response], Any]] = None, 3161 **kwargs: Any, 3162 ) -> SimpleRetriever: 3163 def _get_url() -> str: 3164 """ 3165 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3166 This is needed because the URL is not set until the requester is created. 
3167 """ 3168 3169 _url: str = ( 3170 model.requester.url 3171 if hasattr(model.requester, "url") and model.requester.url is not None 3172 else requester.get_url() 3173 ) 3174 _url_base: str = ( 3175 model.requester.url_base 3176 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3177 else requester.get_url_base() 3178 ) 3179 3180 return _url or _url_base 3181 3182 decoder = ( 3183 self._create_component_from_model(model=model.decoder, config=config) 3184 if model.decoder 3185 else JsonDecoder(parameters={}) 3186 ) 3187 record_selector = self._create_component_from_model( 3188 model=model.record_selector, 3189 name=name, 3190 config=config, 3191 decoder=decoder, 3192 transformations=transformations, 3193 client_side_incremental_sync=client_side_incremental_sync, 3194 file_uploader=file_uploader, 3195 ) 3196 3197 query_properties: Optional[QueryProperties] = None 3198 query_properties_key: Optional[str] = None 3199 if self._query_properties_in_request_parameters(model.requester): 3200 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3201 # places instead of default to request_parameters which isn't clearly documented 3202 if ( 3203 hasattr(model.requester, "fetch_properties_from_endpoint") 3204 and model.requester.fetch_properties_from_endpoint 3205 ): 3206 raise ValueError( 3207 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3208 ) 3209 3210 query_properties_definitions = [] 3211 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3212 if isinstance(request_parameter, QueryPropertiesModel): 3213 query_properties_key = key 3214 query_properties_definitions.append(request_parameter) 3215 3216 if len(query_properties_definitions) > 1: 3217 raise ValueError( 3218 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3219 ) 3220 3221 if len(query_properties_definitions) == 1: 3222 query_properties = self._create_component_from_model( 3223 model=query_properties_definitions[0], config=config 3224 ) 3225 elif ( 3226 hasattr(model.requester, "fetch_properties_from_endpoint") 3227 and model.requester.fetch_properties_from_endpoint 3228 ): 3229 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3230 query_properties_definition = QueryPropertiesModel( 3231 type="QueryProperties", 3232 property_list=model.requester.fetch_properties_from_endpoint, 3233 always_include_properties=None, 3234 property_chunking=None, 3235 ) # type: ignore # $parameters has a default value 3236 3237 query_properties = self.create_query_properties( 3238 model=query_properties_definition, 3239 config=config, 3240 ) 3241 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3242 query_properties = self.create_query_properties( 3243 model=model.requester.query_properties, 3244 config=config, 3245 ) 3246 3247 requester = self._create_component_from_model( 3248 model=model.requester, 3249 decoder=decoder, 3250 name=name, 3251 query_properties_key=query_properties_key, 3252 use_cache=use_cache, 3253 config=config, 3254 ) 3255 3256 # Define cursor only if per partition or common incremental support is needed 3257 cursor = stream_slicer if 
isinstance(stream_slicer, DeclarativeCursor) else None 3258 3259 if ( 3260 not isinstance(stream_slicer, DatetimeBasedCursor) 3261 or type(stream_slicer) is not DatetimeBasedCursor 3262 ): 3263 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 3264 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3265 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3266 # request_options_provider 3267 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3268 elif not request_options_provider: 3269 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3270 3271 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3272 if self._should_limit_slices_fetched(): 3273 stream_slicer = cast( 3274 StreamSlicer, 3275 StreamSlicerTestReadDecorator( 3276 wrapped_slicer=stream_slicer, 3277 maximum_number_of_slices=self._limit_slices_fetched or 5, 3278 ), 3279 ) 3280 3281 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3282 paginator = ( 3283 self._create_component_from_model( 3284 model=model.paginator, 3285 config=config, 3286 url_base=_get_url(), 3287 extractor_model=model.record_selector.extractor, 3288 decoder=decoder, 3289 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3290 ) 3291 if model.paginator 3292 else NoPagination(parameters={}) 3293 ) 3294 3295 ignore_stream_slicer_parameters_on_paginated_requests = ( 3296 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3297 ) 3298 3299 if ( 3300 model.partition_router 3301 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3302 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3303 and any( 3304 parent_stream_config.lazy_read_pointer 3305 for parent_stream_config in model.partition_router.parent_stream_configs 3306 ) 3307 ): 3308 if incremental_sync: 3309 if incremental_sync.type != "DatetimeBasedCursor": 3310 raise ValueError( 3311 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3312 ) 3313 3314 elif incremental_sync.step or incremental_sync.cursor_granularity: 3315 raise ValueError( 3316 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3317 ) 3318 3319 if model.decoder and model.decoder.type != "JsonDecoder": 3320 raise ValueError( 3321 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3322 ) 3323 3324 return LazySimpleRetriever( 3325 name=name, 3326 paginator=paginator, 3327 primary_key=primary_key, 3328 requester=requester, 3329 record_selector=record_selector, 3330 stream_slicer=stream_slicer, 3331 request_option_provider=request_options_provider, 3332 cursor=cursor, 3333 config=config, 3334 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3335 parameters=model.parameters or {}, 3336 ) 3337 3338 return SimpleRetriever( 3339 name=name, 3340 paginator=paginator, 3341 primary_key=primary_key, 3342 requester=requester, 3343 record_selector=record_selector, 3344 stream_slicer=stream_slicer, 3345 request_option_provider=request_options_provider, 3346 cursor=cursor, 3347 config=config, 3348 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3349 additional_query_properties=query_properties, 3350 log_formatter=self._get_log_formatter(log_formatter, name), 3351 parameters=model.parameters or {}, 3352 ) 3353 3354 def _get_log_formatter( 3355 self, log_formatter: Callable[[Response], Any] | None, name: str 3356 ) -> Callable[[Response], Any] | None: 3357 if self._should_limit_slices_fetched(): 3358 return ( 3359 ( 3360 lambda response: format_http_message( 3361 response, 3362 f"Stream '{name}' request", 3363 f"Request performed in order to extract records for stream '{name}'", 3364 name, 3365 ) 3366 ) 3367 if not log_formatter 3368 else log_formatter 3369 ) 3370 return None 3371 3372 def _should_limit_slices_fetched(self) -> bool: 3373 """ 3374 Returns True if the number of slices fetched should be limited, False otherwise. 3375 This is used to limit the number of slices fetched during tests. 3376 """ 3377 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3378 3379 @staticmethod 3380 def _query_properties_in_request_parameters( 3381 requester: Union[HttpRequesterModel, CustomRequesterModel], 3382 ) -> bool: 3383 if not hasattr(requester, "request_parameters"): 3384 return False 3385 request_parameters = requester.request_parameters 3386 if request_parameters and isinstance(request_parameters, Mapping): 3387 for request_parameter in request_parameters.values(): 3388 if isinstance(request_parameter, QueryPropertiesModel): 3389 return True 3390 return False 3391 3392 @staticmethod 3393 def _remove_query_properties( 3394 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3395 ) -> Mapping[str, str]: 3396 return { 3397 parameter_field: request_parameter 3398 for parameter_field, request_parameter in request_parameters.items() 3399 if not isinstance(request_parameter, QueryPropertiesModel) 3400 } 3401 3402 def create_state_delegating_stream( 3403 self, 3404 model: StateDelegatingStreamModel, 3405 config: Config, 3406 has_parent_state: Optional[bool] = None, 3407 **kwargs: Any, 3408 ) -> DeclarativeStream: 3409 if ( 3410 model.full_refresh_stream.name != model.name 3411 or model.name != model.incremental_stream.name 3412 ): 3413 raise ValueError( 3414 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3415 ) 3416 3417 stream_model = ( 3418 model.incremental_stream 3419 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3420 else model.full_refresh_stream 3421 ) 3422 3423 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3424 3425 def _create_async_job_status_mapping( 3426 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3427 ) -> Mapping[str, AsyncJobStatus]: 3428 api_status_to_cdk_status = {} 3429 for cdk_status, api_statuses in model.dict().items(): 3430 if cdk_status == "type": 3431 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3432 continue 3433 3434 for status in api_statuses: 3435 if status in api_status_to_cdk_status: 3436 raise ValueError( 3437 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3438 ) 3439 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3440 return api_status_to_cdk_status 3441 3442 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3443 match status: 3444 case "running": 3445 return AsyncJobStatus.RUNNING 3446 case "completed": 3447 return AsyncJobStatus.COMPLETED 3448 case "failed": 3449 return AsyncJobStatus.FAILED 3450 case "timeout": 3451 return AsyncJobStatus.TIMED_OUT 3452 case _: 3453 raise ValueError(f"Unsupported CDK status {status}") 3454 3455 def create_async_retriever( 3456 self, 3457 model: AsyncRetrieverModel, 3458 config: Config, 3459 *, 3460 name: str, 3461 primary_key: Optional[ 3462 Union[str, List[str], List[List[str]]] 3463 ], # this seems to be needed to match create_simple_retriever 3464 stream_slicer: Optional[StreamSlicer], 3465 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3466 transformations: List[RecordTransformation], 3467 **kwargs: Any, 3468 ) -> AsyncRetriever: 3469 def _get_download_retriever() -> SimpleRetriever: 3470 # We create a record selector for the download retriever 3471 # with no schema normalization and no transformations, neither record filter 3472 # as all this occurs in the record_selector of the AsyncRetriever 3473 record_selector = RecordSelector( 3474 extractor=download_extractor, 3475 name=name, 3476 record_filter=None, 3477 transformations=[], 3478 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3479 config=config, 3480 parameters={}, 3481 ) 3482 paginator = ( 3483 self._create_component_from_model( 3484 model=model.download_paginator, 3485 decoder=decoder, 3486 config=config, 3487 url_base="", 3488 ) 3489 if model.download_paginator 3490 else NoPagination(parameters={}) 3491 ) 3492 3493 return SimpleRetriever( 3494 requester=download_requester, 3495 record_selector=record_selector, 3496 primary_key=None, 3497 name=job_download_components_name, 3498 paginator=paginator, 3499 config=config, 3500 parameters={}, 3501 ) 3502 3503 def _get_job_timeout() -> datetime.timedelta: 3504 user_defined_timeout: Optional[int] = ( 3505 int( 3506 InterpolatedString.create( 3507 str(model.polling_job_timeout), 3508 parameters={}, 3509 ).eval(config) 3510 ) 3511 if model.polling_job_timeout 3512 else None 3513 ) 3514 3515 # check for user defined timeout during the test read or 15 minutes 3516 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3517 # default value for non-connector builder is 60 minutes. 
3518 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3519 3520 return ( 3521 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3522 ) 3523 3524 decoder = ( 3525 self._create_component_from_model(model=model.decoder, config=config) 3526 if model.decoder 3527 else JsonDecoder(parameters={}) 3528 ) 3529 record_selector = self._create_component_from_model( 3530 model=model.record_selector, 3531 config=config, 3532 decoder=decoder, 3533 name=name, 3534 transformations=transformations, 3535 client_side_incremental_sync=client_side_incremental_sync, 3536 ) 3537 3538 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3539 if self._should_limit_slices_fetched(): 3540 stream_slicer = cast( 3541 StreamSlicer, 3542 StreamSlicerTestReadDecorator( 3543 wrapped_slicer=stream_slicer, 3544 maximum_number_of_slices=self._limit_slices_fetched or 5, 3545 ), 3546 ) 3547 3548 creation_requester = self._create_component_from_model( 3549 model=model.creation_requester, 3550 decoder=decoder, 3551 config=config, 3552 name=f"job creation - {name}", 3553 ) 3554 polling_requester = self._create_component_from_model( 3555 model=model.polling_requester, 3556 decoder=decoder, 3557 config=config, 3558 name=f"job polling - {name}", 3559 ) 3560 job_download_components_name = f"job download - {name}" 3561 download_decoder = ( 3562 self._create_component_from_model(model=model.download_decoder, config=config) 3563 if model.download_decoder 3564 else JsonDecoder(parameters={}) 3565 ) 3566 download_extractor = ( 3567 self._create_component_from_model( 3568 model=model.download_extractor, 3569 config=config, 3570 decoder=download_decoder, 3571 parameters=model.parameters, 3572 ) 3573 if model.download_extractor 3574 else DpathExtractor( 3575 [], 3576 config=config, 3577 decoder=download_decoder, 3578 parameters=model.parameters or {}, 3579 ) 3580 ) 3581 download_requester = self._create_component_from_model( 3582 model=model.download_requester, 3583 decoder=download_decoder, 3584 config=config, 3585 name=job_download_components_name, 3586 ) 3587 download_retriever = _get_download_retriever() 3588 abort_requester = ( 3589 self._create_component_from_model( 3590 model=model.abort_requester, 3591 decoder=decoder, 3592 config=config, 3593 name=f"job abort - {name}", 3594 ) 3595 if model.abort_requester 3596 else None 3597 ) 3598 delete_requester = ( 3599 self._create_component_from_model( 3600 model=model.delete_requester, 3601 decoder=decoder, 3602 config=config, 3603 name=f"job delete - {name}", 3604 ) 3605 if model.delete_requester 3606 else None 3607 ) 3608 download_target_requester = ( 3609 self._create_component_from_model( 3610 model=model.download_target_requester, 3611 decoder=decoder, 3612 config=config, 3613 name=f"job extract_url - {name}", 3614 ) 3615 if model.download_target_requester 3616 else None 3617 ) 3618 status_extractor = self._create_component_from_model( 3619 model=model.status_extractor, decoder=decoder, config=config, name=name 3620 ) 3621 download_target_extractor = self._create_component_from_model( 3622 model=model.download_target_extractor, 3623 decoder=decoder, 3624 config=config, 3625 name=name, 3626 ) 3627 3628 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3629 creation_requester=creation_requester, 3630 polling_requester=polling_requester, 3631 download_retriever=download_retriever, 3632 download_target_requester=download_target_requester, 3633 abort_requester=abort_requester, 3634 
delete_requester=delete_requester, 3635 status_extractor=status_extractor, 3636 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3637 download_target_extractor=download_target_extractor, 3638 job_timeout=_get_job_timeout(), 3639 ) 3640 3641 async_job_partition_router = AsyncJobPartitionRouter( 3642 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3643 job_repository, 3644 stream_slices, 3645 self._job_tracker, 3646 self._message_repository, 3647 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3648 has_bulk_parent=False, 3649 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3650 # `None` == default retry is set to 3 attempts, under the hood. 3651 job_max_retry=1 if self._emit_connector_builder_messages else None, 3652 ), 3653 stream_slicer=stream_slicer, 3654 config=config, 3655 parameters=model.parameters or {}, 3656 ) 3657 3658 return AsyncRetriever( 3659 record_selector=record_selector, 3660 stream_slicer=async_job_partition_router, 3661 config=config, 3662 parameters=model.parameters or {}, 3663 ) 3664 3665 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3666 config_migrations = [ 3667 self._create_component_from_model(migration, config) 3668 for migration in ( 3669 model.config_normalization_rules.config_migrations 3670 if ( 3671 model.config_normalization_rules 3672 and model.config_normalization_rules.config_migrations 3673 ) 3674 else [] 3675 ) 3676 ] 3677 config_transformations = [ 3678 self._create_component_from_model(transformation, config) 3679 for transformation in ( 3680 model.config_normalization_rules.transformations 3681 if ( 3682 model.config_normalization_rules 3683 and model.config_normalization_rules.transformations 3684 ) 3685 else [] 3686 ) 3687 ] 3688 config_validations = [ 3689 self._create_component_from_model(validation, config) 3690 for validation in ( 3691 model.config_normalization_rules.validations 3692 if ( 3693 model.config_normalization_rules 3694 and model.config_normalization_rules.validations 3695 ) 3696 else [] 3697 ) 3698 ] 3699 3700 return Spec( 3701 connection_specification=model.connection_specification, 3702 documentation_url=model.documentation_url, 3703 advanced_auth=model.advanced_auth, 3704 parameters={}, 3705 config_migrations=config_migrations, 3706 config_transformations=config_transformations, 3707 config_validations=config_validations, 3708 ) 3709 3710 def create_substream_partition_router( 3711 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3712 ) -> SubstreamPartitionRouter: 3713 parent_stream_configs = [] 3714 if model.parent_stream_configs: 3715 parent_stream_configs.extend( 3716 [ 3717 self._create_message_repository_substream_wrapper( 3718 model=parent_stream_config, config=config, **kwargs 3719 ) 3720 for parent_stream_config in model.parent_stream_configs 3721 ] 3722 ) 3723 3724 return SubstreamPartitionRouter( 3725 parent_stream_configs=parent_stream_configs, 3726 parameters=model.parameters or {}, 3727 config=config, 3728 ) 3729 3730 def _create_message_repository_substream_wrapper( 3731 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3732 ) -> Any: 3733 substream_factory = ModelToComponentFactory( 3734 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3735 limit_slices_fetched=self._limit_slices_fetched, 3736 emit_connector_builder_messages=self._emit_connector_builder_messages, 3737
disable_retries=self._disable_retries, 3738 disable_cache=self._disable_cache, 3739 message_repository=LogAppenderMessageRepositoryDecorator( 3740 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3741 self._message_repository, 3742 self._evaluate_log_level(self._emit_connector_builder_messages), 3743 ), 3744 ) 3745 3746 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3747 has_parent_state = bool( 3748 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3749 if model.incremental_dependency 3750 else False 3751 ) 3752 return substream_factory._create_component_from_model( 3753 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3754 ) 3755 3756 @staticmethod 3757 def create_wait_time_from_header( 3758 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3759 ) -> WaitTimeFromHeaderBackoffStrategy: 3760 return WaitTimeFromHeaderBackoffStrategy( 3761 header=model.header, 3762 parameters=model.parameters or {}, 3763 config=config, 3764 regex=model.regex, 3765 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3766 if model.max_waiting_time_in_seconds is not None 3767 else None, 3768 ) 3769 3770 @staticmethod 3771 def create_wait_until_time_from_header( 3772 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3773 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3774 return WaitUntilTimeFromHeaderBackoffStrategy( 3775 header=model.header, 3776 parameters=model.parameters or {}, 3777 config=config, 3778 min_wait=model.min_wait, 3779 regex=model.regex, 3780 ) 3781 3782 def get_message_repository(self) -> MessageRepository: 3783 return self._message_repository 3784 3785 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3786 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3787 3788 @staticmethod 3789 def create_components_mapping_definition( 3790 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3791 ) -> ComponentMappingDefinition: 3792 interpolated_value = InterpolatedString.create( 3793 model.value, parameters=model.parameters or {} 3794 ) 3795 field_path = [ 3796 InterpolatedString.create(path, parameters=model.parameters or {}) 3797 for path in model.field_path 3798 ] 3799 return ComponentMappingDefinition( 3800 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3801 value=interpolated_value, 3802 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3803 create_or_update=model.create_or_update, 3804 parameters=model.parameters or {}, 3805 ) 3806 3807 def create_http_components_resolver( 3808 self, model: HttpComponentsResolverModel, config: Config 3809 ) -> Any: 3810 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3811 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3812 3813 retriever = self._create_component_from_model( 3814 model=model.retriever, 3815 config=config, 3816 name="", 3817 primary_key=None, 3818 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3819 transformations=[], 3820 ) 3821 3822 components_mapping = [ 3823 self._create_component_from_model( 3824 model=components_mapping_definition_model, 3825 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3826 components_mapping_definition_model.value_type 3827 ), 3828 config=config, 3829 ) 3830 for 
components_mapping_definition_model in model.components_mapping 3831 ] 3832 3833 return HttpComponentsResolver( 3834 retriever=retriever, 3835 config=config, 3836 components_mapping=components_mapping, 3837 parameters=model.parameters or {}, 3838 ) 3839 3840 @staticmethod 3841 def create_stream_config( 3842 model: StreamConfigModel, config: Config, **kwargs: Any 3843 ) -> StreamConfig: 3844 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3845 [x for x in model.configs_pointer] if model.configs_pointer else [] 3846 ) 3847 3848 return StreamConfig( 3849 configs_pointer=model_configs_pointer, 3850 default_values=model.default_values, 3851 parameters=model.parameters or {}, 3852 ) 3853 3854 def create_config_components_resolver( 3855 self, model: ConfigComponentsResolverModel, config: Config 3856 ) -> Any: 3857 model_stream_configs = ( 3858 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3859 ) 3860 3861 stream_configs = [ 3862 self._create_component_from_model( 3863 stream_config, config=config, parameters=model.parameters or {} 3864 ) 3865 for stream_config in model_stream_configs 3866 ] 3867 3868 components_mapping = [ 3869 self._create_component_from_model( 3870 model=components_mapping_definition_model, 3871 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3872 components_mapping_definition_model.value_type 3873 ), 3874 config=config, 3875 ) 3876 for components_mapping_definition_model in model.components_mapping 3877 ] 3878 3879 return ConfigComponentsResolver( 3880 stream_configs=stream_configs, 3881 config=config, 3882 components_mapping=components_mapping, 3883 parameters=model.parameters or {}, 3884 ) 3885 3886 def create_parametrized_components_resolver( 3887 self, model: ParametrizedComponentsResolverModel, config: Config 3888 ) -> ParametrizedComponentsResolver: 3889 stream_parameters = StreamParametersDefinition( 3890 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3891 ) 3892 components_mapping = [ 3893 self._create_component_from_model( 3894 model=components_mapping_definition_model, 3895 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3896 components_mapping_definition_model.value_type 3897 ), 3898 config=config, 3899 ) 3900 for components_mapping_definition_model in model.components_mapping 3901 ] 3902 return ParametrizedComponentsResolver( 3903 stream_parameters=stream_parameters, 3904 config=config, 3905 components_mapping=components_mapping, 3906 parameters=model.parameters or {}, 3907 ) 3908 3909 _UNSUPPORTED_DECODER_ERROR = ( 3910 "Specified decoder of {decoder_type} is not supported for pagination." 3911 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3912 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3913 ) 3914 3915 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3916 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3917 return True 3918 elif isinstance(decoder, CompositeRawDecoder): 3919 return self._is_supported_parser_for_pagination(decoder.parser) 3920 else: 3921 return False 3922 3923 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3924 if isinstance(parser, JsonParser): 3925 return True 3926 elif isinstance(parser, GzipParser): 3927 return isinstance(parser.inner_parser, JsonParser) 3928 else: 3929 return False 3930 3931 def create_http_api_budget( 3932 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3933 ) -> HttpAPIBudget: 3934 policies = [ 3935 self._create_component_from_model(model=policy, config=config) 3936 for policy in model.policies 3937 ] 3938 3939 return HttpAPIBudget( 3940 policies=policies, 3941 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3942 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3943 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3944 ) 3945 3946 def create_fixed_window_call_rate_policy( 3947 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3948 ) -> FixedWindowCallRatePolicy: 3949 matchers = [ 3950 self._create_component_from_model(model=matcher, config=config) 3951 for matcher in model.matchers 3952 ] 3953 3954 # Set the initial reset timestamp to 10 days from now. 3955 # This value will be updated by the first request. 3956 return FixedWindowCallRatePolicy( 3957 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3958 period=parse_duration(model.period), 3959 call_limit=model.call_limit, 3960 matchers=matchers, 3961 ) 3962 3963 def create_file_uploader( 3964 self, model: FileUploaderModel, config: Config, **kwargs: Any 3965 ) -> FileUploader: 3966 name = "File Uploader" 3967 requester = self._create_component_from_model( 3968 model=model.requester, 3969 config=config, 3970 name=name, 3971 **kwargs, 3972 ) 3973 download_target_extractor = self._create_component_from_model( 3974 model=model.download_target_extractor, 3975 config=config, 3976 name=name, 3977 **kwargs, 3978 ) 3979 emit_connector_builder_messages = self._emit_connector_builder_messages 3980 file_uploader = DefaultFileUploader( 3981 requester=requester, 3982 download_target_extractor=download_target_extractor, 3983 config=config, 3984 file_writer=NoopFileWriter() 3985 if emit_connector_builder_messages 3986 else LocalFileSystemFileWriter(), 3987 parameters=model.parameters or {}, 3988 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3989 ) 3990 3991 return ( 3992 ConnectorBuilderFileUploader(file_uploader) 3993 if emit_connector_builder_messages 3994 else file_uploader 3995 ) 3996 3997 def create_moving_window_call_rate_policy( 3998 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3999 ) -> MovingWindowCallRatePolicy: 4000 rates = [ 4001 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4002 ] 4003 matchers = [ 4004 self._create_component_from_model(model=matcher, config=config) 4005 for matcher in model.matchers 4006 ] 4007 return MovingWindowCallRatePolicy( 4008 rates=rates, 4009 matchers=matchers, 4010 ) 4011 4012 def create_unlimited_call_rate_policy( 4013 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4014 ) -> UnlimitedCallRatePolicy: 4015 matchers = [ 4016 
self._create_component_from_model(model=matcher, config=config) 4017 for matcher in model.matchers 4018 ] 4019 4020 return UnlimitedCallRatePolicy( 4021 matchers=matchers, 4022 ) 4023 4024 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4025 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4026 return Rate( 4027 limit=int(interpolated_limit.eval(config=config)), 4028 interval=parse_duration(model.interval), 4029 ) 4030 4031 def create_http_request_matcher( 4032 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4033 ) -> HttpRequestRegexMatcher: 4034 return HttpRequestRegexMatcher( 4035 method=model.method, 4036 url_base=model.url_base, 4037 url_path_pattern=model.url_path_pattern, 4038 params=model.params, 4039 headers=model.headers, 4040 ) 4041 4042 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4043 self._api_budget = self.create_component( 4044 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4045 ) 4046 4047 def create_grouping_partition_router( 4048 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4049 ) -> GroupingPartitionRouter: 4050 underlying_router = self._create_component_from_model( 4051 model=model.underlying_partition_router, config=config 4052 ) 4053 if model.group_size < 1: 4054 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4055 4056 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4057 # because they are specific to individual partitions and cannot be aggregated or handled 4058 # when grouping, potentially leading to incorrect API calls. Any request customization 4059 # should be managed at the stream level through the requester's configuration. 4060 if isinstance(underlying_router, SubstreamPartitionRouter): 4061 if any( 4062 parent_config.request_option 4063 for parent_config in underlying_router.parent_stream_configs 4064 ): 4065 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4066 4067 if isinstance(underlying_router, ListPartitionRouter): 4068 if underlying_router.request_option: 4069 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4070 4071 return GroupingPartitionRouter( 4072 group_size=model.group_size, 4073 underlying_partition_router=underlying_router, 4074 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4075 config=config, 4076 )
631 def __init__( 632 self, 633 limit_pages_fetched_per_slice: Optional[int] = None, 634 limit_slices_fetched: Optional[int] = None, 635 emit_connector_builder_messages: bool = False, 636 disable_retries: bool = False, 637 disable_cache: bool = False, 638 disable_resumable_full_refresh: bool = False, 639 message_repository: Optional[MessageRepository] = None, 640 connector_state_manager: Optional[ConnectorStateManager] = None, 641 max_concurrent_async_job_count: Optional[int] = None, 642 ): 643 self._init_mappings() 644 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 645 self._limit_slices_fetched = limit_slices_fetched 646 self._emit_connector_builder_messages = emit_connector_builder_messages 647 self._disable_retries = disable_retries 648 self._disable_cache = disable_cache 649 self._disable_resumable_full_refresh = disable_resumable_full_refresh 650 self._message_repository = message_repository or InMemoryMessageRepository( 651 self._evaluate_log_level(emit_connector_builder_messages) 652 ) 653 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 654 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 655 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 656 # placeholder for deprecation warnings 657 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
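For orientation, a minimal sketch of constructing the factory with the options shown in __init__ above. All arguments are optional; the specific values below are illustrative (e.g. the page/slice limits a Connector Builder test read might use), not defaults.

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

# Illustrative values only: they simply exercise the knobs exposed by __init__
# (test-read limits, builder messages, disabled cache/retries).
factory = ModelToComponentFactory(
    limit_pages_fetched_per_slice=5,
    limit_slices_fetched=5,
    emit_connector_builder_messages=True,
    disable_retries=True,
    disable_cache=True,
)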
772 def create_component( 773 self, 774 model_type: Type[BaseModel], 775 component_definition: ComponentDefinition, 776 config: Config, 777 **kwargs: Any, 778 ) -> Any: 779 """ 780 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 781 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 782 creating declarative components from that model. 783 784 :param model_type: The type of declarative component that is being initialized 785 :param component_definition: The mapping that represents a declarative component 786 :param config: The connector config that is provided by the customer 787 :return: The declarative component to be used at runtime 788 """ 789 790 component_type = component_definition.get("type") 791 if component_definition.get("type") != model_type.__name__: 792 raise ValueError( 793 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 794 ) 795 796 declarative_component_model = model_type.parse_obj(component_definition) 797 798 if not isinstance(declarative_component_model, model_type): 799 raise ValueError( 800 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 801 ) 802 803 return self._create_component_from_model( 804 model=declarative_component_model, config=config, **kwargs 805 )
Takes a given Pydantic model type and a Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
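As a hedged usage sketch: the import below is an assumption based on how the factory aliases the generated schema models (the generated class is named after the manifest component type), and the manifest snippet is made up for illustration.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,  # assumed import; the factory refers to it as CheckStreamModel
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# The "type" entry must match the model's class name, otherwise create_component raises a ValueError.
check_stream = factory.create_component(
    model_type=CheckStreamModel,
    component_definition={"type": "CheckStream", "stream_names": ["users"]},
    config={},  # the connector config provided by the user
)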
822 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 823 """ 824 Returns the deprecation warnings that were collected during the creation of components. 825 """ 826 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
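For example, a caller such as the Connector Builder might drain these after building components; the print call below is just a stand-in for however the messages are surfaced.

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
# ... build components via factory.create_component(...) ...
for deprecation_log in factory.get_model_deprecations():
    # Each entry is a ConnectorBuilderLogMessage collected while components were created.
    print(deprecation_log)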
843 def create_config_migration( 844 self, model: ConfigMigrationModel, config: Config 845 ) -> ConfigMigration: 846 transformations: List[ConfigTransformation] = [ 847 self._create_component_from_model(transformation, config) 848 for transformation in model.transformations 849 ] 850 851 return ConfigMigration( 852 description=model.description, 853 transformations=transformations, 854 )
856 def create_config_add_fields( 857 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 858 ) -> ConfigAddFields: 859 fields = [self._create_component_from_model(field, config) for field in model.fields] 860 return ConfigAddFields( 861 fields=fields, 862 condition=model.condition or "", 863 )
912 @staticmethod 913 def create_added_field_definition( 914 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 915 ) -> AddedFieldDefinition: 916 interpolated_value = InterpolatedString.create( 917 model.value, parameters=model.parameters or {} 918 ) 919 return AddedFieldDefinition( 920 path=model.path, 921 value=interpolated_value, 922 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 923 parameters=model.parameters or {}, 924 )
926 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 927 added_field_definitions = [ 928 self._create_component_from_model( 929 model=added_field_definition_model, 930 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 931 added_field_definition_model.value_type 932 ), 933 config=config, 934 ) 935 for added_field_definition_model in model.fields 936 ] 937 return AddFields( 938 fields=added_field_definitions, 939 condition=model.condition or "", 940 parameters=model.parameters or {}, 941 )
967 def create_dpath_flatten_fields( 968 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 969 ) -> DpathFlattenFields: 970 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 971 key_transformation = ( 972 KeyTransformation( 973 config=config, 974 prefix=model.key_transformation.prefix, 975 suffix=model.key_transformation.suffix, 976 parameters=model.parameters or {}, 977 ) 978 if model.key_transformation is not None 979 else None 980 ) 981 return DpathFlattenFields( 982 config=config, 983 field_path=model_field_path, 984 delete_origin_value=model.delete_origin_value 985 if model.delete_origin_value is not None 986 else False, 987 replace_record=model.replace_record if model.replace_record is not None else False, 988 key_transformation=key_transformation, 989 parameters=model.parameters or {}, 990 )
1004 def create_api_key_authenticator( 1005 self, 1006 model: ApiKeyAuthenticatorModel, 1007 config: Config, 1008 token_provider: Optional[TokenProvider] = None, 1009 **kwargs: Any, 1010 ) -> ApiKeyAuthenticator: 1011 if model.inject_into is None and model.header is None: 1012 raise ValueError( 1013 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1014 ) 1015 1016 if model.inject_into is not None and model.header is not None: 1017 raise ValueError( 1018 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1019 ) 1020 1021 if token_provider is not None and model.api_token != "": 1022 raise ValueError( 1023 "If token_provider is set, api_token is ignored and has to be set to empty string." 1024 ) 1025 1026 request_option = ( 1027 self._create_component_from_model( 1028 model.inject_into, config, parameters=model.parameters or {} 1029 ) 1030 if model.inject_into 1031 else RequestOption( 1032 inject_into=RequestOptionType.header, 1033 field_name=model.header or "", 1034 parameters=model.parameters or {}, 1035 ) 1036 ) 1037 1038 return ApiKeyAuthenticator( 1039 token_provider=( 1040 token_provider 1041 if token_provider is not None 1042 else InterpolatedStringTokenProvider( 1043 api_token=model.api_token or "", 1044 config=config, 1045 parameters=model.parameters or {}, 1046 ) 1047 ), 1048 request_option=request_option, 1049 config=config, 1050 parameters=model.parameters or {}, 1051 )
1053 def create_legacy_to_per_partition_state_migration( 1054 self, 1055 model: LegacyToPerPartitionStateMigrationModel, 1056 config: Mapping[str, Any], 1057 declarative_stream: DeclarativeStreamModel, 1058 ) -> LegacyToPerPartitionStateMigration: 1059 retriever = declarative_stream.retriever 1060 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1061 raise ValueError( 1062 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1063 ) 1064 partition_router = retriever.partition_router 1065 if not isinstance( 1066 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1067 ): 1068 raise ValueError( 1069 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1070 ) 1071 if not hasattr(partition_router, "parent_stream_configs"): 1072 raise ValueError( 1073 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1074 ) 1075 1076 if not hasattr(declarative_stream, "incremental_sync"): 1077 raise ValueError( 1078 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1079 ) 1080 1081 return LegacyToPerPartitionStateMigration( 1082 partition_router, # type: ignore # was already checked above 1083 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1084 config, 1085 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1086 )
1088 def create_session_token_authenticator( 1089 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1090 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1091 decoder = ( 1092 self._create_component_from_model(model=model.decoder, config=config) 1093 if model.decoder 1094 else JsonDecoder(parameters={}) 1095 ) 1096 login_requester = self._create_component_from_model( 1097 model=model.login_requester, 1098 config=config, 1099 name=f"{name}_login_requester", 1100 decoder=decoder, 1101 ) 1102 token_provider = SessionTokenProvider( 1103 login_requester=login_requester, 1104 session_token_path=model.session_token_path, 1105 expiration_duration=parse_duration(model.expiration_duration) 1106 if model.expiration_duration 1107 else None, 1108 parameters=model.parameters or {}, 1109 message_repository=self._message_repository, 1110 decoder=decoder, 1111 ) 1112 if model.request_authentication.type == "Bearer": 1113 return ModelToComponentFactory.create_bearer_authenticator( 1114 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1115 config, 1116 token_provider=token_provider, 1117 ) 1118 else: 1119 return self.create_api_key_authenticator( 1120 ApiKeyAuthenticatorModel( 1121 type="ApiKeyAuthenticator", 1122 api_token="", 1123 inject_into=model.request_authentication.inject_into, 1124 ), # type: ignore # $parameters and headers default to None 1125 config=config, 1126 token_provider=token_provider, 1127 )
1129 @staticmethod 1130 def create_basic_http_authenticator( 1131 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1132 ) -> BasicHttpAuthenticator: 1133 return BasicHttpAuthenticator( 1134 password=model.password or "", 1135 username=model.username, 1136 config=config, 1137 parameters=model.parameters or {}, 1138 )
1140 @staticmethod 1141 def create_bearer_authenticator( 1142 model: BearerAuthenticatorModel, 1143 config: Config, 1144 token_provider: Optional[TokenProvider] = None, 1145 **kwargs: Any, 1146 ) -> BearerAuthenticator: 1147 if token_provider is not None and model.api_token != "": 1148 raise ValueError( 1149 "If token_provider is set, api_token is ignored and has to be set to empty string." 1150 ) 1151 return BearerAuthenticator( 1152 token_provider=( 1153 token_provider 1154 if token_provider is not None 1155 else InterpolatedStringTokenProvider( 1156 api_token=model.api_token or "", 1157 config=config, 1158 parameters=model.parameters or {}, 1159 ) 1160 ), 1161 config=config, 1162 parameters=model.parameters or {}, 1163 )
1165 @staticmethod 1166 def create_dynamic_stream_check_config( 1167 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1168 ) -> DynamicStreamCheckConfig: 1169 return DynamicStreamCheckConfig( 1170 dynamic_stream_name=model.dynamic_stream_name, 1171 stream_count=model.stream_count or 0, 1172 )
1174 def create_check_stream( 1175 self, model: CheckStreamModel, config: Config, **kwargs: Any 1176 ) -> CheckStream: 1177 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1178 raise ValueError( 1179 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1180 ) 1181 1182 dynamic_streams_check_configs = ( 1183 [ 1184 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1185 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1186 ] 1187 if model.dynamic_streams_check_configs 1188 else [] 1189 ) 1190 1191 return CheckStream( 1192 stream_names=model.stream_names or [], 1193 dynamic_streams_check_configs=dynamic_streams_check_configs, 1194 parameters={}, 1195 )
1197 @staticmethod 1198 def create_check_dynamic_stream( 1199 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1200 ) -> CheckDynamicStream: 1201 assert model.use_check_availability is not None # for mypy 1202 1203 use_check_availability = model.use_check_availability 1204 1205 return CheckDynamicStream( 1206 stream_count=model.stream_count, 1207 use_check_availability=use_check_availability, 1208 parameters={}, 1209 )
1211 def create_composite_error_handler( 1212 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1213 ) -> CompositeErrorHandler: 1214 error_handlers = [ 1215 self._create_component_from_model(model=error_handler_model, config=config) 1216 for error_handler_model in model.error_handlers 1217 ] 1218 return CompositeErrorHandler( 1219 error_handlers=error_handlers, parameters=model.parameters or {} 1220 )
1222 @staticmethod 1223 def create_concurrency_level( 1224 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1225 ) -> ConcurrencyLevel: 1226 return ConcurrencyLevel( 1227 default_concurrency=model.default_concurrency, 1228 max_concurrency=model.max_concurrency, 1229 config=config, 1230 parameters={}, 1231 )
1233 @staticmethod 1234 def apply_stream_state_migrations( 1235 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1236 ) -> MutableMapping[str, Any]: 1237 if stream_state_migrations: 1238 for state_migration in stream_state_migrations: 1239 if state_migration.should_migrate(stream_state): 1240 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1241 stream_state = dict(state_migration.migrate(stream_state)) 1242 return stream_state
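A minimal sketch of the contract this helper expects from a state migration; RenameCursorKeyMigration below is a made-up illustration, not a CDK class.

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

class RenameCursorKeyMigration:
    """Hypothetical migration exposing the two methods the factory calls."""

    def should_migrate(self, stream_state):
        return "updated" in stream_state

    def migrate(self, stream_state):
        # May return an immutable mapping; apply_stream_state_migrations copies it into a dict.
        return {"updated_at": stream_state["updated"]}

state = ModelToComponentFactory.apply_stream_state_migrations(
    stream_state_migrations=[RenameCursorKeyMigration()],
    stream_state={"updated": "2024-01-01"},
)
# state == {"updated_at": "2024-01-01"}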
1244 def create_concurrent_cursor_from_datetime_based_cursor( 1245 self, 1246 model_type: Type[BaseModel], 1247 component_definition: ComponentDefinition, 1248 stream_name: str, 1249 stream_namespace: Optional[str], 1250 config: Config, 1251 message_repository: Optional[MessageRepository] = None, 1252 runtime_lookback_window: Optional[datetime.timedelta] = None, 1253 stream_state_migrations: Optional[List[Any]] = None, 1254 **kwargs: Any, 1255 ) -> ConcurrentCursor: 1256 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1257 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1258 # incoming state and connector_state_manager that is initialized when the component factory is created 1259 stream_state = ( 1260 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1261 if "stream_state" not in kwargs 1262 else kwargs["stream_state"] 1263 ) 1264 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1265 1266 component_type = component_definition.get("type") 1267 if component_definition.get("type") != model_type.__name__: 1268 raise ValueError( 1269 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1270 ) 1271 1272 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1273 1274 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1275 raise ValueError( 1276 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1277 ) 1278 1279 interpolated_cursor_field = InterpolatedString.create( 1280 datetime_based_cursor_model.cursor_field, 1281 parameters=datetime_based_cursor_model.parameters or {}, 1282 ) 1283 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1284 1285 interpolated_partition_field_start = InterpolatedString.create( 1286 datetime_based_cursor_model.partition_field_start or "start_time", 1287 parameters=datetime_based_cursor_model.parameters or {}, 1288 ) 1289 interpolated_partition_field_end = InterpolatedString.create( 1290 datetime_based_cursor_model.partition_field_end or "end_time", 1291 parameters=datetime_based_cursor_model.parameters or {}, 1292 ) 1293 1294 slice_boundary_fields = ( 1295 interpolated_partition_field_start.eval(config=config), 1296 interpolated_partition_field_end.eval(config=config), 1297 ) 1298 1299 datetime_format = datetime_based_cursor_model.datetime_format 1300 1301 cursor_granularity = ( 1302 parse_duration(datetime_based_cursor_model.cursor_granularity) 1303 if datetime_based_cursor_model.cursor_granularity 1304 else None 1305 ) 1306 1307 lookback_window = None 1308 interpolated_lookback_window = ( 1309 InterpolatedString.create( 1310 datetime_based_cursor_model.lookback_window, 1311 parameters=datetime_based_cursor_model.parameters or {}, 1312 ) 1313 if datetime_based_cursor_model.lookback_window 1314 else None 1315 ) 1316 if interpolated_lookback_window: 1317 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1318 if evaluated_lookback_window: 1319 lookback_window = parse_duration(evaluated_lookback_window) 1320 1321 connector_state_converter: DateTimeStreamStateConverter 1322 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1323 datetime_format=datetime_format, 1324 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1325 is_sequential_state=True, # 
ConcurrentPerPartitionCursor only works with sequential state 1326 cursor_granularity=cursor_granularity, 1327 ) 1328 1329 # Adjusts the stream state by applying the runtime lookback window. 1330 # This is used to ensure correct state handling in case of failed partitions. 1331 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1332 if runtime_lookback_window and stream_state_value: 1333 new_stream_state = ( 1334 connector_state_converter.parse_timestamp(stream_state_value) 1335 - runtime_lookback_window 1336 ) 1337 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1338 new_stream_state 1339 ) 1340 1341 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1342 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1343 start_date_runtime_value = self.create_min_max_datetime( 1344 model=datetime_based_cursor_model.start_datetime, config=config 1345 ) 1346 else: 1347 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1348 1349 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1350 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1351 end_date_runtime_value = self.create_min_max_datetime( 1352 model=datetime_based_cursor_model.end_datetime, config=config 1353 ) 1354 else: 1355 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1356 1357 interpolated_start_date = MinMaxDatetime.create( 1358 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1359 parameters=datetime_based_cursor_model.parameters, 1360 ) 1361 interpolated_end_date = ( 1362 None 1363 if not end_date_runtime_value 1364 else MinMaxDatetime.create( 1365 end_date_runtime_value, datetime_based_cursor_model.parameters 1366 ) 1367 ) 1368 1369 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1370 if not interpolated_start_date.datetime_format: 1371 interpolated_start_date.datetime_format = datetime_format 1372 if interpolated_end_date and not interpolated_end_date.datetime_format: 1373 interpolated_end_date.datetime_format = datetime_format 1374 1375 start_date = interpolated_start_date.get_datetime(config=config) 1376 end_date_provider = ( 1377 partial(interpolated_end_date.get_datetime, config) 1378 if interpolated_end_date 1379 else connector_state_converter.get_end_provider() 1380 ) 1381 1382 if ( 1383 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1384 ) or ( 1385 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1386 ): 1387 raise ValueError( 1388 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1389 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1390 ) 1391 1392 # When step is not defined, default to a step size from the starting date to the present moment 1393 step_length = datetime.timedelta.max 1394 interpolated_step = ( 1395 InterpolatedString.create( 1396 datetime_based_cursor_model.step, 1397 parameters=datetime_based_cursor_model.parameters or {}, 1398 ) 1399 if datetime_based_cursor_model.step 1400 else None 1401 ) 1402 if interpolated_step: 1403 evaluated_step = interpolated_step.eval(config) 1404 if evaluated_step: 1405 step_length = parse_duration(evaluated_step) 1406 1407 clamping_strategy: ClampingStrategy = NoClamping() 1408 if datetime_based_cursor_model.clamping: 1409 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1410 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1411 # object which we want to keep agnostic of being low-code 1412 target = InterpolatedString( 1413 string=datetime_based_cursor_model.clamping.target, 1414 parameters=datetime_based_cursor_model.parameters or {}, 1415 ) 1416 evaluated_target = target.eval(config=config) 1417 match evaluated_target: 1418 case "DAY": 1419 clamping_strategy = DayClampingStrategy() 1420 end_date_provider = ClampingEndProvider( 1421 DayClampingStrategy(is_ceiling=False), 1422 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1423 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1424 ) 1425 case "WEEK": 1426 if ( 1427 not datetime_based_cursor_model.clamping.target_details 1428 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1429 ): 1430 raise ValueError( 1431 "Given WEEK clamping, weekday needs to be provided as target_details" 1432 ) 1433 weekday = self._assemble_weekday( 1434 datetime_based_cursor_model.clamping.target_details["weekday"] 1435 ) 1436 clamping_strategy = WeekClampingStrategy(weekday) 1437 end_date_provider = ClampingEndProvider( 1438 WeekClampingStrategy(weekday, is_ceiling=False), 1439 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1440 granularity=cursor_granularity or datetime.timedelta(days=1), 1441 ) 1442 case "MONTH": 1443 clamping_strategy = MonthClampingStrategy() 1444 end_date_provider = ClampingEndProvider( 1445 MonthClampingStrategy(is_ceiling=False), 1446 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1447 granularity=cursor_granularity or datetime.timedelta(days=1), 1448 ) 1449 case _: 1450 raise ValueError( 1451 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1452 ) 1453 1454 return ConcurrentCursor( 1455 stream_name=stream_name, 1456 stream_namespace=stream_namespace, 1457 stream_state=stream_state, 1458 message_repository=message_repository or self._message_repository, 1459 connector_state_manager=self._connector_state_manager, 1460 connector_state_converter=connector_state_converter, 1461 cursor_field=cursor_field, 1462 slice_boundary_fields=slice_boundary_fields, 1463 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1464 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1465 lookback_window=lookback_window, 1466 slice_range=step_length, 1467 cursor_granularity=cursor_granularity, 1468 clamping_strategy=clamping_strategy, 1469 )
1471 def create_concurrent_cursor_from_incrementing_count_cursor( 1472 self, 1473 model_type: Type[BaseModel], 1474 component_definition: ComponentDefinition, 1475 stream_name: str, 1476 stream_namespace: Optional[str], 1477 config: Config, 1478 message_repository: Optional[MessageRepository] = None, 1479 **kwargs: Any, 1480 ) -> ConcurrentCursor: 1481 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1482 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1483 # incoming state and connector_state_manager that is initialized when the component factory is created 1484 stream_state = ( 1485 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1486 if "stream_state" not in kwargs 1487 else kwargs["stream_state"] 1488 ) 1489 1490 component_type = component_definition.get("type") 1491 if component_definition.get("type") != model_type.__name__: 1492 raise ValueError( 1493 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1494 ) 1495 1496 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1497 1498 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1499 raise ValueError( 1500 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1501 ) 1502 1503 interpolated_start_value = ( 1504 InterpolatedString.create( 1505 incrementing_count_cursor_model.start_value, # type: ignore 1506 parameters=incrementing_count_cursor_model.parameters or {}, 1507 ) 1508 if incrementing_count_cursor_model.start_value 1509 else 0 1510 ) 1511 1512 interpolated_cursor_field = InterpolatedString.create( 1513 incrementing_count_cursor_model.cursor_field, 1514 parameters=incrementing_count_cursor_model.parameters or {}, 1515 ) 1516 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1517 1518 connector_state_converter = IncrementingCountStreamStateConverter( 1519 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1520 ) 1521 1522 return ConcurrentCursor( 1523 stream_name=stream_name, 1524 stream_namespace=stream_namespace, 1525 stream_state=stream_state, 1526 message_repository=message_repository or self._message_repository, 1527 connector_state_manager=self._connector_state_manager, 1528 connector_state_converter=connector_state_converter, 1529 cursor_field=cursor_field, 1530 slice_boundary_fields=None, 1531 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1532 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 )
1554 def create_concurrent_cursor_from_perpartition_cursor( 1555 self, 1556 state_manager: ConnectorStateManager, 1557 model_type: Type[BaseModel], 1558 component_definition: ComponentDefinition, 1559 stream_name: str, 1560 stream_namespace: Optional[str], 1561 config: Config, 1562 stream_state: MutableMapping[str, Any], 1563 partition_router: PartitionRouter, 1564 stream_state_migrations: Optional[List[Any]] = None, 1565 **kwargs: Any, 1566 ) -> ConcurrentPerPartitionCursor: 1567 component_type = component_definition.get("type") 1568 if component_definition.get("type") != model_type.__name__: 1569 raise ValueError( 1570 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1571 ) 1572 1573 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1574 1575 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1576 raise ValueError( 1577 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1578 ) 1579 1580 interpolated_cursor_field = InterpolatedString.create( 1581 datetime_based_cursor_model.cursor_field, 1582 parameters=datetime_based_cursor_model.parameters or {}, 1583 ) 1584 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1585 1586 datetime_format = datetime_based_cursor_model.datetime_format 1587 1588 cursor_granularity = ( 1589 parse_duration(datetime_based_cursor_model.cursor_granularity) 1590 if datetime_based_cursor_model.cursor_granularity 1591 else None 1592 ) 1593 1594 connector_state_converter: DateTimeStreamStateConverter 1595 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1596 datetime_format=datetime_format, 1597 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1598 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1599 cursor_granularity=cursor_granularity, 1600 ) 1601 1602 # Create the cursor factory 1603 cursor_factory = ConcurrentCursorFactory( 1604 partial( 1605 self.create_concurrent_cursor_from_datetime_based_cursor, 1606 state_manager=state_manager, 1607 model_type=model_type, 1608 component_definition=component_definition, 1609 stream_name=stream_name, 1610 stream_namespace=stream_namespace, 1611 config=config, 1612 message_repository=NoopMessageRepository(), 1613 stream_state_migrations=stream_state_migrations, 1614 ) 1615 ) 1616 1617 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1618 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1619 use_global_cursor = isinstance( 1620 partition_router, GroupingPartitionRouter 1621 ) or component_definition.get("global_substream_cursor", False) 1622 1623 # Return the concurrent cursor and state converter 1624 return ConcurrentPerPartitionCursor( 1625 cursor_factory=cursor_factory, 1626 partition_router=partition_router, 1627 stream_name=stream_name, 1628 stream_namespace=stream_namespace, 1629 stream_state=stream_state, 1630 message_repository=self._message_repository, # type: ignore 1631 connector_state_manager=state_manager, 1632 connector_state_converter=connector_state_converter, 1633 cursor_field=cursor_field, 1634 use_global_cursor=use_global_cursor, 1635 )
1637 @staticmethod 1638 def create_constant_backoff_strategy( 1639 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1640 ) -> ConstantBackoffStrategy: 1641 return ConstantBackoffStrategy( 1642 backoff_time_in_seconds=model.backoff_time_in_seconds, 1643 config=config, 1644 parameters=model.parameters or {}, 1645 )
1647 def create_cursor_pagination( 1648 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1649 ) -> CursorPaginationStrategy: 1650 if isinstance(decoder, PaginationDecoderDecorator): 1651 inner_decoder = decoder.decoder 1652 else: 1653 inner_decoder = decoder 1654 decoder = PaginationDecoderDecorator(decoder=decoder) 1655 1656 if self._is_supported_decoder_for_pagination(inner_decoder): 1657 decoder_to_use = decoder 1658 else: 1659 raise ValueError( 1660 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1661 ) 1662 1663 return CursorPaginationStrategy( 1664 cursor_value=model.cursor_value, 1665 decoder=decoder_to_use, 1666 page_size=model.page_size, 1667 stop_condition=model.stop_condition, 1668 config=config, 1669 parameters=model.parameters or {}, 1670 )
1672 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1673 """ 1674 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1675 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1676 :param model: The Pydantic model of the custom component being created 1677 :param config: The custom defined connector config 1678 :return: The declarative component built from the Pydantic model to be used at runtime 1679 """ 1680 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1681 component_fields = get_type_hints(custom_component_class) 1682 model_args = model.dict() 1683 model_args["config"] = config 1684 1685 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1686 # we defer to these arguments over the component's definition 1687 for key, arg in kwargs.items(): 1688 model_args[key] = arg 1689 1690 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1691 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1692 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1693 for model_field, model_value in model_args.items(): 1694 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1695 if ( 1696 isinstance(model_value, dict) 1697 and "type" not in model_value 1698 and model_field in component_fields 1699 ): 1700 derived_type = self._derive_component_type_from_type_hints( 1701 component_fields.get(model_field) 1702 ) 1703 if derived_type: 1704 model_value["type"] = derived_type 1705 1706 if self._is_component(model_value): 1707 model_args[model_field] = self._create_nested_component( 1708 model, model_field, model_value, config 1709 ) 1710 elif isinstance(model_value, list): 1711 vals = [] 1712 for v in model_value: 1713 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1714 derived_type = self._derive_component_type_from_type_hints( 1715 component_fields.get(model_field) 1716 ) 1717 if derived_type: 1718 v["type"] = derived_type 1719 if self._is_component(v): 1720 vals.append(self._create_nested_component(model, model_field, v, config)) 1721 else: 1722 vals.append(v) 1723 model_args[model_field] = vals 1724 1725 kwargs = { 1726 class_field: model_args[class_field] 1727 for class_field in component_fields.keys() 1728 if class_field in model_args 1729 } 1730 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
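To illustrate the matching behavior, a hypothetical custom class follows (MyRecordTransformation and its module path are made up): the factory resolves class_name, injects config, derives the constructor fields from the class's type hints, and drops any manifest keys that do not match them.

from dataclasses import dataclass
from typing import Any, Mapping

@dataclass
class MyRecordTransformation:
    # Referenced from a manifest as, for example:
    #   type: CustomTransformation
    #   class_name: "source_example.components.MyRecordTransformation"
    #   multiplier: 2
    config: Mapping[str, Any]  # injected by the factory before instantiation
    multiplier: int = 1        # matched against the manifest's additional properties; unmatched keys are ignored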
1862 def create_datetime_based_cursor( 1863 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1864 ) -> DatetimeBasedCursor: 1865 start_datetime: Union[str, MinMaxDatetime] = ( 1866 model.start_datetime 1867 if isinstance(model.start_datetime, str) 1868 else self.create_min_max_datetime(model.start_datetime, config) 1869 ) 1870 end_datetime: Union[str, MinMaxDatetime, None] = None 1871 if model.is_data_feed and model.end_datetime: 1872 raise ValueError("Data feed does not support end_datetime") 1873 if model.is_data_feed and model.is_client_side_incremental: 1874 raise ValueError( 1875 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1876 ) 1877 if model.end_datetime: 1878 end_datetime = ( 1879 model.end_datetime 1880 if isinstance(model.end_datetime, str) 1881 else self.create_min_max_datetime(model.end_datetime, config) 1882 ) 1883 1884 end_time_option = ( 1885 self._create_component_from_model( 1886 model.end_time_option, config, parameters=model.parameters or {} 1887 ) 1888 if model.end_time_option 1889 else None 1890 ) 1891 start_time_option = ( 1892 self._create_component_from_model( 1893 model.start_time_option, config, parameters=model.parameters or {} 1894 ) 1895 if model.start_time_option 1896 else None 1897 ) 1898 1899 return DatetimeBasedCursor( 1900 cursor_field=model.cursor_field, 1901 cursor_datetime_formats=model.cursor_datetime_formats 1902 if model.cursor_datetime_formats 1903 else [], 1904 cursor_granularity=model.cursor_granularity, 1905 datetime_format=model.datetime_format, 1906 end_datetime=end_datetime, 1907 start_datetime=start_datetime, 1908 step=model.step, 1909 end_time_option=end_time_option, 1910 lookback_window=model.lookback_window, 1911 start_time_option=start_time_option, 1912 partition_field_end=model.partition_field_end, 1913 partition_field_start=model.partition_field_start, 1914 message_repository=self._message_repository, 1915 is_compare_strictly=model.is_compare_strictly, 1916 config=config, 1917 parameters=model.parameters or {}, 1918 )
1920 def create_declarative_stream( 1921 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1922 ) -> DeclarativeStream: 1923 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1924 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1925 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1926 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1927 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1928 1929 primary_key = model.primary_key.__root__ if model.primary_key else None 1930 stop_condition_on_cursor = ( 1931 model.incremental_sync 1932 and hasattr(model.incremental_sync, "is_data_feed") 1933 and model.incremental_sync.is_data_feed 1934 ) 1935 client_side_incremental_sync = None 1936 if ( 1937 model.incremental_sync 1938 and hasattr(model.incremental_sync, "is_client_side_incremental") 1939 and model.incremental_sync.is_client_side_incremental 1940 ): 1941 supported_slicers = ( 1942 DatetimeBasedCursor, 1943 GlobalSubstreamCursor, 1944 PerPartitionWithGlobalCursor, 1945 ) 1946 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1947 raise ValueError( 1948 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead" 1949 ) 1950 cursor = ( 1951 combined_slicers 1952 if isinstance( 1953 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1954 ) 1955 else self._create_component_from_model(model=model.incremental_sync, config=config) 1956 ) 1957 1958 client_side_incremental_sync = {"cursor": cursor} 1959 1960 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1961 cursor_model = model.incremental_sync 1962 1963 end_time_option = ( 1964 self._create_component_from_model( 1965 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1966 ) 1967 if cursor_model.end_time_option 1968 else None 1969 ) 1970 start_time_option = ( 1971 self._create_component_from_model( 1972 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1973 ) 1974 if cursor_model.start_time_option 1975 else None 1976 ) 1977 1978 request_options_provider = DatetimeBasedRequestOptionsProvider( 1979 start_time_option=start_time_option, 1980 end_time_option=end_time_option, 1981 partition_field_start=cursor_model.partition_field_end, 1982 partition_field_end=cursor_model.partition_field_end, 1983 config=config, 1984 parameters=model.parameters or {}, 1985 ) 1986 elif model.incremental_sync and isinstance( 1987 model.incremental_sync, IncrementingCountCursorModel 1988 ): 1989 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1990 1991 start_time_option = ( 1992 self._create_component_from_model( 1993 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1994 config, 1995 parameters=cursor_model.parameters or {}, 1996 ) 1997 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1998 else None 1999 ) 2000 2001 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2002 # the default DatetimeBasedRequestOptionsProvider() sets them to 
start_time/end_time 2003 partition_field_start = "start" 2004 2005 request_options_provider = DatetimeBasedRequestOptionsProvider( 2006 start_time_option=start_time_option, 2007 partition_field_start=partition_field_start, 2008 config=config, 2009 parameters=model.parameters or {}, 2010 ) 2011 else: 2012 request_options_provider = None 2013 2014 transformations = [] 2015 if model.transformations: 2016 for transformation_model in model.transformations: 2017 transformations.append( 2018 self._create_component_from_model(model=transformation_model, config=config) 2019 ) 2020 file_uploader = None 2021 if model.file_uploader: 2022 file_uploader = self._create_component_from_model( 2023 model=model.file_uploader, config=config 2024 ) 2025 2026 retriever = self._create_component_from_model( 2027 model=model.retriever, 2028 config=config, 2029 name=model.name, 2030 primary_key=primary_key, 2031 stream_slicer=combined_slicers, 2032 request_options_provider=request_options_provider, 2033 stop_condition_on_cursor=stop_condition_on_cursor, 2034 client_side_incremental_sync=client_side_incremental_sync, 2035 transformations=transformations, 2036 file_uploader=file_uploader, 2037 incremental_sync=model.incremental_sync, 2038 ) 2039 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2040 2041 if model.state_migrations: 2042 state_transformations = [ 2043 self._create_component_from_model(state_migration, config, declarative_stream=model) 2044 for state_migration in model.state_migrations 2045 ] 2046 else: 2047 state_transformations = [] 2048 2049 schema_loader: Union[ 2050 CompositeSchemaLoader, 2051 DefaultSchemaLoader, 2052 DynamicSchemaLoader, 2053 InlineSchemaLoader, 2054 JsonFileSchemaLoader, 2055 ] 2056 if model.schema_loader and isinstance(model.schema_loader, list): 2057 nested_schema_loaders = [ 2058 self._create_component_from_model(model=nested_schema_loader, config=config) 2059 for nested_schema_loader in model.schema_loader 2060 ] 2061 schema_loader = CompositeSchemaLoader( 2062 schema_loaders=nested_schema_loaders, parameters={} 2063 ) 2064 elif model.schema_loader: 2065 schema_loader = self._create_component_from_model( 2066 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2067 config=config, 2068 ) 2069 else: 2070 options = model.parameters or {} 2071 if "name" not in options: 2072 options["name"] = model.name 2073 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2074 2075 return DeclarativeStream( 2076 name=model.name or "", 2077 primary_key=primary_key, 2078 retriever=retriever, 2079 schema_loader=schema_loader, 2080 stream_cursor_field=cursor_field or "", 2081 state_migrations=state_transformations, 2082 config=config, 2083 parameters=model.parameters or {}, 2084 )
2253 def create_default_error_handler( 2254 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2255 ) -> DefaultErrorHandler: 2256 backoff_strategies = [] 2257 if model.backoff_strategies: 2258 for backoff_strategy_model in model.backoff_strategies: 2259 backoff_strategies.append( 2260 self._create_component_from_model(model=backoff_strategy_model, config=config) 2261 ) 2262 2263 response_filters = [] 2264 if model.response_filters: 2265 for response_filter_model in model.response_filters: 2266 response_filters.append( 2267 self._create_component_from_model(model=response_filter_model, config=config) 2268 ) 2269 response_filters.append( 2270 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2271 ) 2272 2273 return DefaultErrorHandler( 2274 backoff_strategies=backoff_strategies, 2275 max_retries=model.max_retries, 2276 response_filters=response_filters, 2277 config=config, 2278 parameters=model.parameters or {}, 2279 )
2281 def create_default_paginator( 2282 self, 2283 model: DefaultPaginatorModel, 2284 config: Config, 2285 *, 2286 url_base: str, 2287 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2288 decoder: Optional[Decoder] = None, 2289 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2290 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2291 if decoder: 2292 if self._is_supported_decoder_for_pagination(decoder): 2293 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2294 else: 2295 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2296 else: 2297 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2298 page_size_option = ( 2299 self._create_component_from_model(model=model.page_size_option, config=config) 2300 if model.page_size_option 2301 else None 2302 ) 2303 page_token_option = ( 2304 self._create_component_from_model(model=model.page_token_option, config=config) 2305 if model.page_token_option 2306 else None 2307 ) 2308 pagination_strategy = self._create_component_from_model( 2309 model=model.pagination_strategy, 2310 config=config, 2311 decoder=decoder_to_use, 2312 extractor_model=extractor_model, 2313 ) 2314 if cursor_used_for_stop_condition: 2315 pagination_strategy = StopConditionPaginationStrategyDecorator( 2316 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2317 ) 2318 paginator = DefaultPaginator( 2319 decoder=decoder_to_use, 2320 page_size_option=page_size_option, 2321 page_token_option=page_token_option, 2322 pagination_strategy=pagination_strategy, 2323 url_base=url_base, 2324 config=config, 2325 parameters=model.parameters or {}, 2326 ) 2327 if self._limit_pages_fetched_per_slice: 2328 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2329 return paginator
2331 def create_dpath_extractor( 2332 self, 2333 model: DpathExtractorModel, 2334 config: Config, 2335 decoder: Optional[Decoder] = None, 2336 **kwargs: Any, 2337 ) -> DpathExtractor: 2338 if decoder: 2339 decoder_to_use = decoder 2340 else: 2341 decoder_to_use = JsonDecoder(parameters={}) 2342 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2343 return DpathExtractor( 2344 decoder=decoder_to_use, 2345 field_path=model_field_path, 2346 config=config, 2347 parameters=model.parameters or {}, 2348 )
2369 def create_http_requester( 2370 self, 2371 model: HttpRequesterModel, 2372 config: Config, 2373 decoder: Decoder = JsonDecoder(parameters={}), 2374 query_properties_key: Optional[str] = None, 2375 use_cache: Optional[bool] = None, 2376 *, 2377 name: str, 2378 ) -> HttpRequester: 2379 authenticator = ( 2380 self._create_component_from_model( 2381 model=model.authenticator, 2382 config=config, 2383 url_base=model.url or model.url_base, 2384 name=name, 2385 decoder=decoder, 2386 ) 2387 if model.authenticator 2388 else None 2389 ) 2390 error_handler = ( 2391 self._create_component_from_model(model=model.error_handler, config=config) 2392 if model.error_handler 2393 else DefaultErrorHandler( 2394 backoff_strategies=[], 2395 response_filters=[], 2396 config=config, 2397 parameters=model.parameters or {}, 2398 ) 2399 ) 2400 2401 api_budget = self._api_budget 2402 2403 # Removes QueryProperties components from the interpolated mappings because it has been designed 2404 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2405 # instead of through jinja interpolation 2406 request_parameters: Optional[Union[str, Mapping[str, str]]] 2407 if isinstance(model.request_parameters, Mapping): 2408 request_parameters = self._remove_query_properties(model.request_parameters) 2409 else: 2410 request_parameters = model.request_parameters 2411 2412 request_options_provider = InterpolatedRequestOptionsProvider( 2413 request_body=model.request_body, 2414 request_body_data=model.request_body_data, 2415 request_body_json=model.request_body_json, 2416 request_headers=model.request_headers, 2417 request_parameters=request_parameters, 2418 query_properties_key=query_properties_key, 2419 config=config, 2420 parameters=model.parameters or {}, 2421 ) 2422 2423 assert model.use_cache is not None # for mypy 2424 assert model.http_method is not None # for mypy 2425 2426 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2427 2428 return HttpRequester( 2429 name=name, 2430 url=model.url, 2431 url_base=model.url_base, 2432 path=model.path, 2433 authenticator=authenticator, 2434 error_handler=error_handler, 2435 api_budget=api_budget, 2436 http_method=HttpMethod[model.http_method.value], 2437 request_options_provider=request_options_provider, 2438 config=config, 2439 disable_retries=self._disable_retries, 2440 parameters=model.parameters or {}, 2441 message_repository=self._message_repository, 2442 use_cache=should_use_cache, 2443 decoder=decoder, 2444 stream_response=decoder.is_stream_response() if decoder else False, 2445 )
2447 @staticmethod 2448 def create_http_response_filter( 2449 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2450 ) -> HttpResponseFilter: 2451 if model.action: 2452 action = ResponseAction(model.action.value) 2453 else: 2454 action = None 2455 2456 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2457 2458 http_codes = ( 2459 set(model.http_codes) if model.http_codes else set() 2460 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2461 2462 return HttpResponseFilter( 2463 action=action, 2464 failure_type=failure_type, 2465 error_message=model.error_message or "", 2466 error_message_contains=model.error_message_contains or "", 2467 http_codes=http_codes, 2468 predicate=model.predicate or "", 2469 config=config, 2470 parameters=model.parameters or {}, 2471 )
2479 def create_complex_field_type( 2480 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2481 ) -> ComplexFieldType: 2482 items = ( 2483 self._create_component_from_model(model=model.items, config=config) 2484 if isinstance(model.items, ComplexFieldTypeModel) 2485 else model.items 2486 ) 2487 2488 return ComplexFieldType(field_type=model.field_type, items=items)
2490 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2491 target_type = ( 2492 self._create_component_from_model(model=model.target_type, config=config) 2493 if isinstance(model.target_type, ComplexFieldTypeModel) 2494 else model.target_type 2495 ) 2496 2497 return TypesMap( 2498 target_type=target_type, 2499 current_type=model.current_type, 2500 condition=model.condition if model.condition is not None else "True", 2501 )
2503 def create_schema_type_identifier( 2504 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2505 ) -> SchemaTypeIdentifier: 2506 types_mapping = [] 2507 if model.types_mapping: 2508 types_mapping.extend( 2509 [ 2510 self._create_component_from_model(types_map, config=config) 2511 for types_map in model.types_mapping 2512 ] 2513 ) 2514 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2515 [x for x in model.schema_pointer] if model.schema_pointer else [] 2516 ) 2517 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2518 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2519 [x for x in model.type_pointer] if model.type_pointer else None 2520 ) 2521 2522 return SchemaTypeIdentifier( 2523 schema_pointer=model_schema_pointer, 2524 key_pointer=model_key_pointer, 2525 type_pointer=model_type_pointer, 2526 types_mapping=types_mapping, 2527 parameters=model.parameters or {}, 2528 )
2530 def create_dynamic_schema_loader( 2531 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2532 ) -> DynamicSchemaLoader: 2533 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2534 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2535 2536 schema_transformations = [] 2537 if model.schema_transformations: 2538 for transformation_model in model.schema_transformations: 2539 schema_transformations.append( 2540 self._create_component_from_model(model=transformation_model, config=config) 2541 ) 2542 name = "dynamic_properties" 2543 retriever = self._create_component_from_model( 2544 model=model.retriever, 2545 config=config, 2546 name=name, 2547 primary_key=None, 2548 stream_slicer=combined_slicers, 2549 transformations=[], 2550 use_cache=True, 2551 log_formatter=( 2552 lambda response: format_http_message( 2553 response, 2554 f"Schema loader '{name}' request", 2555 f"Request performed in order to extract schema.", 2556 name, 2557 is_auxiliary=True, 2558 ) 2559 ), 2560 ) 2561 schema_type_identifier = self._create_component_from_model( 2562 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2563 ) 2564 schema_filter = ( 2565 self._create_component_from_model( 2566 model.schema_filter, config=config, parameters=model.parameters or {} 2567 ) 2568 if model.schema_filter is not None 2569 else None 2570 ) 2571 2572 return DynamicSchemaLoader( 2573 retriever=retriever, 2574 config=config, 2575 schema_transformations=schema_transformations, 2576 schema_filter=schema_filter, 2577 schema_type_identifier=schema_type_identifier, 2578 parameters=model.parameters or {}, 2579 )
2599 def create_gzip_decoder( 2600 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2601 ) -> Decoder: 2602 _compressed_response_types = { 2603 "gzip", 2604 "x-gzip", 2605 "gzip, deflate", 2606 "x-gzip, deflate", 2607 "application/zip", 2608 "application/gzip", 2609 "application/x-gzip", 2610 "application/x-zip-compressed", 2611 } 2612 2613 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2614 2615 if self._emit_connector_builder_messages: 2616 # This is surprising, but if the response is not streamed, 2617 # CompositeRawDecoder calls response.content, and the requests library transparently decompresses the data, unlike response.raw, 2618 # which uses urllib3 directly and does not decompress the data. 2619 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2620 2621 return CompositeRawDecoder.by_headers( 2622 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2623 stream_response=True, 2624 fallback_parser=gzip_parser.inner_parser, 2625 )
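# Illustrative note on the selection above (behavior as written, example headers are hypothetical): a response
# carrying "Content-Encoding: gzip" or a "Content-Type" listed in _compressed_response_types is routed through the
# GzipParser, while any other response falls back to gzip_parser.inner_parser (for example a JsonLineParser or
# CsvParser). In the connector builder case, the decoder is built around gzip_parser.inner_parser with streaming
# disabled (the False above), because response.content already arrives decompressed.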
2627 @staticmethod 2628 def create_incrementing_count_cursor( 2629 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2630 ) -> DatetimeBasedCursor: 2631 # This should not actually get used anywhere at runtime, but it is needed to pass checks since 2632 # we still parse models into components. The issue is that there's no runtime implementation of an 2633 # IncrementingCountCursor. 2634 # A known and expected limitation of this stub is that running a check with a declared IncrementingCountCursor uses this cursor without a ConcurrentCursor. 2635 return DatetimeBasedCursor( 2636 cursor_field=model.cursor_field, 2637 datetime_format="%Y-%m-%d", 2638 start_datetime="2024-12-12", 2639 config=config, 2640 parameters={}, 2641 )
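# Illustrative note (hypothetical manifest values): an incremental_sync block such as
#
#   incremental_sync:
#     type: IncrementingCountCursor
#     cursor_field: id
#
# is stubbed here as DatetimeBasedCursor(cursor_field="id", datetime_format="%Y-%m-%d", start_datetime="2024-12-12", ...)
# purely so that model parsing succeeds; the concurrent code path is expected to provide the actual cursor behavior.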
2690 @staticmethod 2691 def create_jwt_authenticator( 2692 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2693 ) -> JwtAuthenticator: 2694 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2695 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2696 return JwtAuthenticator( 2697 config=config, 2698 parameters=model.parameters or {}, 2699 algorithm=JwtAlgorithm(model.algorithm.value), 2700 secret_key=model.secret_key, 2701 base64_encode_secret_key=model.base64_encode_secret_key, 2702 token_duration=model.token_duration, 2703 header_prefix=model.header_prefix, 2704 kid=jwt_headers.kid, 2705 typ=jwt_headers.typ, 2706 cty=jwt_headers.cty, 2707 iss=jwt_payload.iss, 2708 sub=jwt_payload.sub, 2709 aud=jwt_payload.aud, 2710 additional_jwt_headers=model.additional_jwt_headers, 2711 additional_jwt_payload=model.additional_jwt_payload, 2712 )
2714 def create_list_partition_router( 2715 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2716 ) -> ListPartitionRouter: 2717 request_option = ( 2718 self._create_component_from_model(model.request_option, config) 2719 if model.request_option 2720 else None 2721 ) 2722 return ListPartitionRouter( 2723 cursor_field=model.cursor_field, 2724 request_option=request_option, 2725 values=model.values, 2726 config=config, 2727 parameters=model.parameters or {}, 2728 )
2730 @staticmethod 2731 def create_min_max_datetime( 2732 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2733 ) -> MinMaxDatetime: 2734 return MinMaxDatetime( 2735 datetime=model.datetime, 2736 datetime_format=model.datetime_format or "", 2737 max_datetime=model.max_datetime or "", 2738 min_datetime=model.min_datetime or "", 2739 parameters=model.parameters or {}, 2740 )
2752 def create_oauth_authenticator( 2753 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2754 ) -> DeclarativeOauth2Authenticator: 2755 profile_assertion = ( 2756 self._create_component_from_model(model.profile_assertion, config=config) 2757 if model.profile_assertion 2758 else None 2759 ) 2760 2761 if model.refresh_token_updater: 2762 # ignore type error because fixing it would have a lot of dependencies, revisit later 2763 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2764 config, 2765 InterpolatedString.create( 2766 model.token_refresh_endpoint, # type: ignore 2767 parameters=model.parameters or {}, 2768 ).eval(config), 2769 access_token_name=InterpolatedString.create( 2770 model.access_token_name or "access_token", parameters=model.parameters or {} 2771 ).eval(config), 2772 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2773 expires_in_name=InterpolatedString.create( 2774 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2775 ).eval(config), 2776 client_id_name=InterpolatedString.create( 2777 model.client_id_name or "client_id", parameters=model.parameters or {} 2778 ).eval(config), 2779 client_id=InterpolatedString.create( 2780 model.client_id, parameters=model.parameters or {} 2781 ).eval(config) 2782 if model.client_id 2783 else model.client_id, 2784 client_secret_name=InterpolatedString.create( 2785 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2786 ).eval(config), 2787 client_secret=InterpolatedString.create( 2788 model.client_secret, parameters=model.parameters or {} 2789 ).eval(config) 2790 if model.client_secret 2791 else model.client_secret, 2792 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2793 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2794 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2795 grant_type_name=InterpolatedString.create( 2796 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2797 ).eval(config), 2798 grant_type=InterpolatedString.create( 2799 model.grant_type or "refresh_token", parameters=model.parameters or {} 2800 ).eval(config), 2801 refresh_request_body=InterpolatedMapping( 2802 model.refresh_request_body or {}, parameters=model.parameters or {} 2803 ).eval(config), 2804 refresh_request_headers=InterpolatedMapping( 2805 model.refresh_request_headers or {}, parameters=model.parameters or {} 2806 ).eval(config), 2807 scopes=model.scopes, 2808 token_expiry_date_format=model.token_expiry_date_format, 2809 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2810 message_repository=self._message_repository, 2811 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2812 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2813 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2814 ) 2815 # ignore type error because fixing it would have a lot of dependencies, revisit later 2816 return DeclarativeOauth2Authenticator( # type: ignore 2817 access_token_name=model.access_token_name or "access_token", 2818 access_token_value=model.access_token_value, 2819 client_id_name=model.client_id_name or "client_id", 2820 client_id=model.client_id, 2821 client_secret_name=model.client_secret_name or "client_secret", 2822 client_secret=model.client_secret, 2823 expires_in_name=model.expires_in_name or 
"expires_in", 2824 grant_type_name=model.grant_type_name or "grant_type", 2825 grant_type=model.grant_type or "refresh_token", 2826 refresh_request_body=model.refresh_request_body, 2827 refresh_request_headers=model.refresh_request_headers, 2828 refresh_token_name=model.refresh_token_name or "refresh_token", 2829 refresh_token=model.refresh_token, 2830 scopes=model.scopes, 2831 token_expiry_date=model.token_expiry_date, 2832 token_expiry_date_format=model.token_expiry_date_format, 2833 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2834 token_refresh_endpoint=model.token_refresh_endpoint, 2835 config=config, 2836 parameters=model.parameters or {}, 2837 message_repository=self._message_repository, 2838 profile_assertion=profile_assertion, 2839 use_profile_assertion=model.use_profile_assertion, 2840 )
2842 def create_offset_increment( 2843 self, 2844 model: OffsetIncrementModel, 2845 config: Config, 2846 decoder: Decoder, 2847 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2848 **kwargs: Any, 2849 ) -> OffsetIncrement: 2850 if isinstance(decoder, PaginationDecoderDecorator): 2851 inner_decoder = decoder.decoder 2852 else: 2853 inner_decoder = decoder 2854 decoder = PaginationDecoderDecorator(decoder=decoder) 2855 2856 if self._is_supported_decoder_for_pagination(inner_decoder): 2857 decoder_to_use = decoder 2858 else: 2859 raise ValueError( 2860 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2861 ) 2862 2863 # Ideally we would instantiate the runtime extractor at the highest level (in this case the SimpleRetriever) 2864 # so that it could be shared by OffsetIncrement and RecordSelector. However, because we instantiate the 2865 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain the existing 2866 # behavior of having two separate extractors with identical behavior since they use the same extractor model. 2867 # When we have more time to investigate, we can look into reusing the same component. 2868 extractor = ( 2869 self._create_component_from_model( 2870 model=extractor_model, config=config, decoder=decoder_to_use 2871 ) 2872 if extractor_model 2873 else None 2874 ) 2875 2876 return OffsetIncrement( 2877 page_size=model.page_size, 2878 config=config, 2879 decoder=decoder_to_use, 2880 extractor=extractor, 2881 inject_on_first_request=model.inject_on_first_request or False, 2882 parameters=model.parameters or {}, 2883 )
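# Illustrative note (assumed strategy behavior): a pagination strategy such as
#
#   pagination_strategy:
#     type: OffsetIncrement
#     page_size: 100
#
# advances its offset based on the records extracted from each page, which is why the strategy receives its own
# extractor here even though the RecordSelector builds an equivalent one from the same extractor model.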
2885 @staticmethod 2886 def create_page_increment( 2887 model: PageIncrementModel, config: Config, **kwargs: Any 2888 ) -> PageIncrement: 2889 return PageIncrement( 2890 page_size=model.page_size, 2891 config=config, 2892 start_from_page=model.start_from_page or 0, 2893 inject_on_first_request=model.inject_on_first_request or False, 2894 parameters=model.parameters or {}, 2895 )
2897 def create_parent_stream_config( 2898 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2899 ) -> ParentStreamConfig: 2900 declarative_stream = self._create_component_from_model( 2901 model.stream, config=config, **kwargs 2902 ) 2903 request_option = ( 2904 self._create_component_from_model(model.request_option, config=config) 2905 if model.request_option 2906 else None 2907 ) 2908 2909 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2910 raise ValueError( 2911 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2912 ) 2913 2914 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2915 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2916 ) 2917 2918 return ParentStreamConfig( 2919 parent_key=model.parent_key, 2920 request_option=request_option, 2921 stream=declarative_stream, 2922 partition_field=model.partition_field, 2923 config=config, 2924 incremental_dependency=model.incremental_dependency or False, 2925 parameters=model.parameters or {}, 2926 extra_fields=model.extra_fields, 2927 lazy_read_pointer=model_lazy_read_pointer, 2928 )
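# Illustrative sketch (hypothetical manifest values): lazy_read_pointer must point at a concrete path in the
# parent record, e.g.
#
#   parent_stream_configs:
#     - type: ParentStreamConfig
#       parent_key: id
#       partition_field: parent_id
#       lazy_read_pointer: ["included", "children"]
#
# while a pointer containing a wildcard, such as ["included", "*"], is rejected by the check above.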
2930 def create_properties_from_endpoint( 2931 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2932 ) -> PropertiesFromEndpoint: 2933 retriever = self._create_component_from_model( 2934 model=model.retriever, 2935 config=config, 2936 name="dynamic_properties", 2937 primary_key=None, 2938 stream_slicer=None, 2939 transformations=[], 2940 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to be different 2941 ) 2942 return PropertiesFromEndpoint( 2943 property_field_path=model.property_field_path, 2944 retriever=retriever, 2945 config=config, 2946 parameters=model.parameters or {}, 2947 )
2949 def create_property_chunking( 2950 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2951 ) -> PropertyChunking: 2952 record_merge_strategy = ( 2953 self._create_component_from_model( 2954 model=model.record_merge_strategy, config=config, **kwargs 2955 ) 2956 if model.record_merge_strategy 2957 else None 2958 ) 2959 2960 property_limit_type: PropertyLimitType 2961 match model.property_limit_type: 2962 case PropertyLimitTypeModel.property_count: 2963 property_limit_type = PropertyLimitType.property_count 2964 case PropertyLimitTypeModel.characters: 2965 property_limit_type = PropertyLimitType.characters 2966 case _: 2967 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2968 2969 return PropertyChunking( 2970 property_limit_type=property_limit_type, 2971 property_limit=model.property_limit, 2972 record_merge_strategy=record_merge_strategy, 2973 config=config, 2974 parameters=model.parameters or {}, 2975 )
2977 def create_query_properties( 2978 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2979 ) -> QueryProperties: 2980 if isinstance(model.property_list, list): 2981 property_list = model.property_list 2982 else: 2983 property_list = self._create_component_from_model( 2984 model=model.property_list, config=config, **kwargs 2985 ) 2986 2987 property_chunking = ( 2988 self._create_component_from_model( 2989 model=model.property_chunking, config=config, **kwargs 2990 ) 2991 if model.property_chunking 2992 else None 2993 ) 2994 2995 return QueryProperties( 2996 property_list=property_list, 2997 always_include_properties=model.always_include_properties, 2998 property_chunking=property_chunking, 2999 config=config, 3000 parameters=model.parameters or {}, 3001 )
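# Illustrative sketch (hypothetical manifest values): a QueryProperties definition combining an explicit property
# list with chunking, e.g.
#
#   type: QueryProperties
#   property_list: ["id", "name", "created_at"]
#   property_chunking:
#     type: PropertyChunking
#     property_limit_type: property_count
#     property_limit: 2
#
# yields QueryProperties(property_list=["id", "name", "created_at"], property_chunking=PropertyChunking(...), ...);
# when property_list is itself a component (for example a PropertiesFromEndpoint definition), it is built
# recursively through _create_component_from_model instead.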
3015 @staticmethod 3016 def create_request_option( 3017 model: RequestOptionModel, config: Config, **kwargs: Any 3018 ) -> RequestOption: 3019 inject_into = RequestOptionType(model.inject_into.value) 3020 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3021 [ 3022 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3023 for segment in model.field_path 3024 ] 3025 if model.field_path 3026 else None 3027 ) 3028 field_name = ( 3029 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3030 if model.field_name 3031 else None 3032 ) 3033 return RequestOption( 3034 field_name=field_name, 3035 field_path=field_path, 3036 inject_into=inject_into, 3037 parameters=kwargs.get("parameters", {}), 3038 )
3040 def create_record_selector( 3041 self, 3042 model: RecordSelectorModel, 3043 config: Config, 3044 *, 3045 name: str, 3046 transformations: List[RecordTransformation] | None = None, 3047 decoder: Decoder | None = None, 3048 client_side_incremental_sync: Dict[str, Any] | None = None, 3049 file_uploader: Optional[DefaultFileUploader] = None, 3050 **kwargs: Any, 3051 ) -> RecordSelector: 3052 extractor = self._create_component_from_model( 3053 model=model.extractor, decoder=decoder, config=config 3054 ) 3055 record_filter = ( 3056 self._create_component_from_model(model.record_filter, config=config) 3057 if model.record_filter 3058 else None 3059 ) 3060 3061 transform_before_filtering = ( 3062 False if model.transform_before_filtering is None else model.transform_before_filtering 3063 ) 3064 if client_side_incremental_sync: 3065 record_filter = ClientSideIncrementalRecordFilterDecorator( 3066 config=config, 3067 parameters=model.parameters, 3068 condition=model.record_filter.condition 3069 if (model.record_filter and hasattr(model.record_filter, "condition")) 3070 else None, 3071 **client_side_incremental_sync, 3072 ) 3073 transform_before_filtering = ( 3074 True 3075 if model.transform_before_filtering is None 3076 else model.transform_before_filtering 3077 ) 3078 3079 if model.schema_normalization is None: 3080 # default to no schema normalization if not set 3081 model.schema_normalization = SchemaNormalizationModel.None_ 3082 3083 schema_normalization = ( 3084 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3085 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3086 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3087 ) 3088 3089 return RecordSelector( 3090 extractor=extractor, 3091 name=name, 3092 config=config, 3093 record_filter=record_filter, 3094 transformations=transformations or [], 3095 file_uploader=file_uploader, 3096 schema_normalization=schema_normalization, 3097 parameters=model.parameters or {}, 3098 transform_before_filtering=transform_before_filtering, 3099 )
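# Illustrative summary of the transform_before_filtering default above (behavior as written):
#
#   model.transform_before_filtering | client_side_incremental_sync | effective value
#   ---------------------------------+------------------------------+-----------------
#   None                             | not provided                 | False
#   None                             | provided                     | True
#   True / False                     | either                       | the model value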
3109 def create_selective_authenticator( 3110 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3111 ) -> DeclarativeAuthenticator: 3112 authenticators = { 3113 name: self._create_component_from_model(model=auth, config=config) 3114 for name, auth in model.authenticators.items() 3115 } 3116 # SelectiveAuthenticator will return an instance of DeclarativeAuthenticator or raise a ValueError 3117 return SelectiveAuthenticator( # type: ignore[abstract] 3118 config=config, 3119 authenticators=authenticators, 3120 authenticator_selection_path=model.authenticator_selection_path, 3121 **kwargs, 3122 )
3124 @staticmethod 3125 def create_legacy_session_token_authenticator( 3126 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3127 ) -> LegacySessionTokenAuthenticator: 3128 return LegacySessionTokenAuthenticator( 3129 api_url=url_base, 3130 header=model.header, 3131 login_url=model.login_url, 3132 password=model.password or "", 3133 session_token=model.session_token or "", 3134 session_token_response_key=model.session_token_response_key or "", 3135 username=model.username or "", 3136 validate_session_url=model.validate_session_url, 3137 config=config, 3138 parameters=model.parameters or {}, 3139 )
3141 def create_simple_retriever( 3142 self, 3143 model: SimpleRetrieverModel, 3144 config: Config, 3145 *, 3146 name: str, 3147 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3148 stream_slicer: Optional[StreamSlicer], 3149 request_options_provider: Optional[RequestOptionsProvider] = None, 3150 stop_condition_on_cursor: bool = False, 3151 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3152 transformations: List[RecordTransformation], 3153 file_uploader: Optional[DefaultFileUploader] = None, 3154 incremental_sync: Optional[ 3155 Union[ 3156 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3157 ] 3158 ] = None, 3159 use_cache: Optional[bool] = None, 3160 log_formatter: Optional[Callable[[Response], Any]] = None, 3161 **kwargs: Any, 3162 ) -> SimpleRetriever: 3163 def _get_url() -> str: 3164 """ 3165 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3166 This is needed because the URL is not set until the requester is created. 3167 """ 3168 3169 _url: str = ( 3170 model.requester.url 3171 if hasattr(model.requester, "url") and model.requester.url is not None 3172 else requester.get_url() 3173 ) 3174 _url_base: str = ( 3175 model.requester.url_base 3176 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3177 else requester.get_url_base() 3178 ) 3179 3180 return _url or _url_base 3181 3182 decoder = ( 3183 self._create_component_from_model(model=model.decoder, config=config) 3184 if model.decoder 3185 else JsonDecoder(parameters={}) 3186 ) 3187 record_selector = self._create_component_from_model( 3188 model=model.record_selector, 3189 name=name, 3190 config=config, 3191 decoder=decoder, 3192 transformations=transformations, 3193 client_side_incremental_sync=client_side_incremental_sync, 3194 file_uploader=file_uploader, 3195 ) 3196 3197 query_properties: Optional[QueryProperties] = None 3198 query_properties_key: Optional[str] = None 3199 if self._query_properties_in_request_parameters(model.requester): 3200 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3201 # places instead of default to request_parameters which isn't clearly documented 3202 if ( 3203 hasattr(model.requester, "fetch_properties_from_endpoint") 3204 and model.requester.fetch_properties_from_endpoint 3205 ): 3206 raise ValueError( 3207 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3208 ) 3209 3210 query_properties_definitions = [] 3211 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3212 if isinstance(request_parameter, QueryPropertiesModel): 3213 query_properties_key = key 3214 query_properties_definitions.append(request_parameter) 3215 3216 if len(query_properties_definitions) > 1: 3217 raise ValueError( 3218 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3219 ) 3220 3221 if len(query_properties_definitions) == 1: 3222 query_properties = self._create_component_from_model( 3223 model=query_properties_definitions[0], config=config 3224 ) 3225 elif ( 3226 hasattr(model.requester, "fetch_properties_from_endpoint") 3227 and model.requester.fetch_properties_from_endpoint 3228 ): 3229 # 
todo: Deprecate this condition once dependent connectors migrate to query_properties 3230 query_properties_definition = QueryPropertiesModel( 3231 type="QueryProperties", 3232 property_list=model.requester.fetch_properties_from_endpoint, 3233 always_include_properties=None, 3234 property_chunking=None, 3235 ) # type: ignore # $parameters has a default value 3236 3237 query_properties = self.create_query_properties( 3238 model=query_properties_definition, 3239 config=config, 3240 ) 3241 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3242 query_properties = self.create_query_properties( 3243 model=model.requester.query_properties, 3244 config=config, 3245 ) 3246 3247 requester = self._create_component_from_model( 3248 model=model.requester, 3249 decoder=decoder, 3250 name=name, 3251 query_properties_key=query_properties_key, 3252 use_cache=use_cache, 3253 config=config, 3254 ) 3255 3256 # Define cursor only if per partition or common incremental support is needed 3257 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3258 3259 if ( 3260 not isinstance(stream_slicer, DatetimeBasedCursor) 3261 or type(stream_slicer) is not DatetimeBasedCursor 3262 ): 3263 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 3264 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3265 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the SimpleRetriever's 3266 # request_options_provider 3267 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3268 elif not request_options_provider: 3269 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3270 3271 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3272 if self._should_limit_slices_fetched(): 3273 stream_slicer = cast( 3274 StreamSlicer, 3275 StreamSlicerTestReadDecorator( 3276 wrapped_slicer=stream_slicer, 3277 maximum_number_of_slices=self._limit_slices_fetched or 5, 3278 ), 3279 ) 3280 3281 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3282 paginator = ( 3283 self._create_component_from_model( 3284 model=model.paginator, 3285 config=config, 3286 url_base=_get_url(), 3287 extractor_model=model.record_selector.extractor, 3288 decoder=decoder, 3289 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3290 ) 3291 if model.paginator 3292 else NoPagination(parameters={}) 3293 ) 3294 3295 ignore_stream_slicer_parameters_on_paginated_requests = ( 3296 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3297 ) 3298 3299 if ( 3300 model.partition_router 3301 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3302 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3303 and any( 3304 parent_stream_config.lazy_read_pointer 3305 for parent_stream_config in model.partition_router.parent_stream_configs 3306 ) 3307 ): 3308 if incremental_sync: 3309 if incremental_sync.type != "DatetimeBasedCursor": 3310 raise ValueError( 3311 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3312 ) 3313 3314 elif incremental_sync.step or incremental_sync.cursor_granularity: 3315 raise ValueError( 3316 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
3317 ) 3318 3319 if model.decoder and model.decoder.type != "JsonDecoder": 3320 raise ValueError( 3321 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3322 ) 3323 3324 return LazySimpleRetriever( 3325 name=name, 3326 paginator=paginator, 3327 primary_key=primary_key, 3328 requester=requester, 3329 record_selector=record_selector, 3330 stream_slicer=stream_slicer, 3331 request_option_provider=request_options_provider, 3332 cursor=cursor, 3333 config=config, 3334 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3335 parameters=model.parameters or {}, 3336 ) 3337 3338 return SimpleRetriever( 3339 name=name, 3340 paginator=paginator, 3341 primary_key=primary_key, 3342 requester=requester, 3343 record_selector=record_selector, 3344 stream_slicer=stream_slicer, 3345 request_option_provider=request_options_provider, 3346 cursor=cursor, 3347 config=config, 3348 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3349 additional_query_properties=query_properties, 3350 log_formatter=self._get_log_formatter(log_formatter, name), 3351 parameters=model.parameters or {}, 3352 )
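# Illustrative sketch (hypothetical manifest values): query properties may be declared inline in the requester's
# request_parameters, e.g.
#
#   requester:
#     type: HttpRequester
#     request_parameters:
#       properties:
#         type: QueryProperties
#         property_list: ["id", "name"]
#
# in which case query_properties_key becomes "properties" and is passed to the requester so it knows which
# parameter carries the property list. Declaring QueryProperties in request_parameters and also setting
# fetch_properties_from_endpoint raises the ValueError above, as does defining more than one QueryProperties
# entry in request_parameters.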
3402 def create_state_delegating_stream( 3403 self, 3404 model: StateDelegatingStreamModel, 3405 config: Config, 3406 has_parent_state: Optional[bool] = None, 3407 **kwargs: Any, 3408 ) -> DeclarativeStream: 3409 if ( 3410 model.full_refresh_stream.name != model.name 3411 or model.name != model.incremental_stream.name 3412 ): 3413 raise ValueError( 3414 f"The state_delegating_stream, full_refresh_stream and incremental_stream must have the same name. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3415 ) 3416 3417 stream_model = ( 3418 model.incremental_stream 3419 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3420 else model.full_refresh_stream 3421 ) 3422 3423 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # A DeclarativeStream will be created because stream_model is a stream description
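# Illustrative example of the selection above (hypothetical state): for a StateDelegatingStream named "orders",
# the incremental_stream definition is built when saved state already exists for "orders" or has_parent_state is
# truthy, and the full_refresh_stream definition is built otherwise.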
3455 def create_async_retriever( 3456 self, 3457 model: AsyncRetrieverModel, 3458 config: Config, 3459 *, 3460 name: str, 3461 primary_key: Optional[ 3462 Union[str, List[str], List[List[str]]] 3463 ], # this seems to be needed to match create_simple_retriever 3464 stream_slicer: Optional[StreamSlicer], 3465 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3466 transformations: List[RecordTransformation], 3467 **kwargs: Any, 3468 ) -> AsyncRetriever: 3469 def _get_download_retriever() -> SimpleRetriever: 3470 # We create a record selector for the download retriever 3471 # with no schema normalization, no transformations, and no record filter, 3472 # as all of this occurs in the record_selector of the AsyncRetriever 3473 record_selector = RecordSelector( 3474 extractor=download_extractor, 3475 name=name, 3476 record_filter=None, 3477 transformations=[], 3478 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3479 config=config, 3480 parameters={}, 3481 ) 3482 paginator = ( 3483 self._create_component_from_model( 3484 model=model.download_paginator, 3485 decoder=decoder, 3486 config=config, 3487 url_base="", 3488 ) 3489 if model.download_paginator 3490 else NoPagination(parameters={}) 3491 ) 3492 3493 return SimpleRetriever( 3494 requester=download_requester, 3495 record_selector=record_selector, 3496 primary_key=None, 3497 name=job_download_components_name, 3498 paginator=paginator, 3499 config=config, 3500 parameters={}, 3501 ) 3502 3503 def _get_job_timeout() -> datetime.timedelta: 3504 user_defined_timeout: Optional[int] = ( 3505 int( 3506 InterpolatedString.create( 3507 str(model.polling_job_timeout), 3508 parameters={}, 3509 ).eval(config) 3510 ) 3511 if model.polling_job_timeout 3512 else None 3513 ) 3514 3515 # for a test read, use the user-defined timeout or default to 15 minutes 3516 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3517 # outside the connector builder, use the user-defined timeout or default to 60 minutes.
3518 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3519 3520 return ( 3521 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3522 ) 3523 3524 decoder = ( 3525 self._create_component_from_model(model=model.decoder, config=config) 3526 if model.decoder 3527 else JsonDecoder(parameters={}) 3528 ) 3529 record_selector = self._create_component_from_model( 3530 model=model.record_selector, 3531 config=config, 3532 decoder=decoder, 3533 name=name, 3534 transformations=transformations, 3535 client_side_incremental_sync=client_side_incremental_sync, 3536 ) 3537 3538 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3539 if self._should_limit_slices_fetched(): 3540 stream_slicer = cast( 3541 StreamSlicer, 3542 StreamSlicerTestReadDecorator( 3543 wrapped_slicer=stream_slicer, 3544 maximum_number_of_slices=self._limit_slices_fetched or 5, 3545 ), 3546 ) 3547 3548 creation_requester = self._create_component_from_model( 3549 model=model.creation_requester, 3550 decoder=decoder, 3551 config=config, 3552 name=f"job creation - {name}", 3553 ) 3554 polling_requester = self._create_component_from_model( 3555 model=model.polling_requester, 3556 decoder=decoder, 3557 config=config, 3558 name=f"job polling - {name}", 3559 ) 3560 job_download_components_name = f"job download - {name}" 3561 download_decoder = ( 3562 self._create_component_from_model(model=model.download_decoder, config=config) 3563 if model.download_decoder 3564 else JsonDecoder(parameters={}) 3565 ) 3566 download_extractor = ( 3567 self._create_component_from_model( 3568 model=model.download_extractor, 3569 config=config, 3570 decoder=download_decoder, 3571 parameters=model.parameters, 3572 ) 3573 if model.download_extractor 3574 else DpathExtractor( 3575 [], 3576 config=config, 3577 decoder=download_decoder, 3578 parameters=model.parameters or {}, 3579 ) 3580 ) 3581 download_requester = self._create_component_from_model( 3582 model=model.download_requester, 3583 decoder=download_decoder, 3584 config=config, 3585 name=job_download_components_name, 3586 ) 3587 download_retriever = _get_download_retriever() 3588 abort_requester = ( 3589 self._create_component_from_model( 3590 model=model.abort_requester, 3591 decoder=decoder, 3592 config=config, 3593 name=f"job abort - {name}", 3594 ) 3595 if model.abort_requester 3596 else None 3597 ) 3598 delete_requester = ( 3599 self._create_component_from_model( 3600 model=model.delete_requester, 3601 decoder=decoder, 3602 config=config, 3603 name=f"job delete - {name}", 3604 ) 3605 if model.delete_requester 3606 else None 3607 ) 3608 download_target_requester = ( 3609 self._create_component_from_model( 3610 model=model.download_target_requester, 3611 decoder=decoder, 3612 config=config, 3613 name=f"job extract_url - {name}", 3614 ) 3615 if model.download_target_requester 3616 else None 3617 ) 3618 status_extractor = self._create_component_from_model( 3619 model=model.status_extractor, decoder=decoder, config=config, name=name 3620 ) 3621 download_target_extractor = self._create_component_from_model( 3622 model=model.download_target_extractor, 3623 decoder=decoder, 3624 config=config, 3625 name=name, 3626 ) 3627 3628 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3629 creation_requester=creation_requester, 3630 polling_requester=polling_requester, 3631 download_retriever=download_retriever, 3632 download_target_requester=download_target_requester, 3633 abort_requester=abort_requester, 3634 
delete_requester=delete_requester, 3635 status_extractor=status_extractor, 3636 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3637 download_target_extractor=download_target_extractor, 3638 job_timeout=_get_job_timeout(), 3639 ) 3640 3641 async_job_partition_router = AsyncJobPartitionRouter( 3642 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3643 job_repository, 3644 stream_slices, 3645 self._job_tracker, 3646 self._message_repository, 3647 # FIXME: work would need to be done here to detect whether a stream has a parent stream that is bulk 3648 has_bulk_parent=False, 3649 # set `job_max_retry` to 1 for the `Connector Builder` use-case. 3650 # `None` means the default of 3 retry attempts is used under the hood. 3651 job_max_retry=1 if self._emit_connector_builder_messages else None, 3652 ), 3653 stream_slicer=stream_slicer, 3654 config=config, 3655 parameters=model.parameters or {}, 3656 ) 3657 3658 return AsyncRetriever( 3659 record_selector=record_selector, 3660 stream_slicer=async_job_partition_router, 3661 config=config, 3662 parameters=model.parameters or {}, 3663 )
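# Illustrative example of _get_job_timeout above (hypothetical value): with polling_job_timeout: 30 in the
# manifest, both a connector builder test read and a regular sync use a 30 minute timeout; with no value set,
# a test read falls back to 15 minutes and a regular sync to 60 minutes.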
3665 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3666 config_migrations = [ 3667 self._create_component_from_model(migration, config) 3668 for migration in ( 3669 model.config_normalization_rules.config_migrations 3670 if ( 3671 model.config_normalization_rules 3672 and model.config_normalization_rules.config_migrations 3673 ) 3674 else [] 3675 ) 3676 ] 3677 config_transformations = [ 3678 self._create_component_from_model(transformation, config) 3679 for transformation in ( 3680 model.config_normalization_rules.transformations 3681 if ( 3682 model.config_normalization_rules 3683 and model.config_normalization_rules.transformations 3684 ) 3685 else [] 3686 ) 3687 ] 3688 config_validations = [ 3689 self._create_component_from_model(validation, config) 3690 for validation in ( 3691 model.config_normalization_rules.validations 3692 if ( 3693 model.config_normalization_rules 3694 and model.config_normalization_rules.validations 3695 ) 3696 else [] 3697 ) 3698 ] 3699 3700 return Spec( 3701 connection_specification=model.connection_specification, 3702 documentation_url=model.documentation_url, 3703 advanced_auth=model.advanced_auth, 3704 parameters={}, 3705 config_migrations=config_migrations, 3706 config_transformations=config_transformations, 3707 config_validations=config_validations, 3708 )
3710 def create_substream_partition_router( 3711 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3712 ) -> SubstreamPartitionRouter: 3713 parent_stream_configs = [] 3714 if model.parent_stream_configs: 3715 parent_stream_configs.extend( 3716 [ 3717 self._create_message_repository_substream_wrapper( 3718 model=parent_stream_config, config=config, **kwargs 3719 ) 3720 for parent_stream_config in model.parent_stream_configs 3721 ] 3722 ) 3723 3724 return SubstreamPartitionRouter( 3725 parent_stream_configs=parent_stream_configs, 3726 parameters=model.parameters or {}, 3727 config=config, 3728 )
3756 @staticmethod 3757 def create_wait_time_from_header( 3758 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3759 ) -> WaitTimeFromHeaderBackoffStrategy: 3760 return WaitTimeFromHeaderBackoffStrategy( 3761 header=model.header, 3762 parameters=model.parameters or {}, 3763 config=config, 3764 regex=model.regex, 3765 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3766 if model.max_waiting_time_in_seconds is not None 3767 else None, 3768 )
3770 @staticmethod 3771 def create_wait_until_time_from_header( 3772 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3773 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3774 return WaitUntilTimeFromHeaderBackoffStrategy( 3775 header=model.header, 3776 parameters=model.parameters or {}, 3777 config=config, 3778 min_wait=model.min_wait, 3779 regex=model.regex, 3780 )
3788 @staticmethod 3789 def create_components_mapping_definition( 3790 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3791 ) -> ComponentMappingDefinition: 3792 interpolated_value = InterpolatedString.create( 3793 model.value, parameters=model.parameters or {} 3794 ) 3795 field_path = [ 3796 InterpolatedString.create(path, parameters=model.parameters or {}) 3797 for path in model.field_path 3798 ] 3799 return ComponentMappingDefinition( 3800 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3801 value=interpolated_value, 3802 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3803 create_or_update=model.create_or_update, 3804 parameters=model.parameters or {}, 3805 )
3807 def create_http_components_resolver( 3808 self, model: HttpComponentsResolverModel, config: Config 3809 ) -> Any: 3810 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3811 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3812 3813 retriever = self._create_component_from_model( 3814 model=model.retriever, 3815 config=config, 3816 name="", 3817 primary_key=None, 3818 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3819 transformations=[], 3820 ) 3821 3822 components_mapping = [ 3823 self._create_component_from_model( 3824 model=components_mapping_definition_model, 3825 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3826 components_mapping_definition_model.value_type 3827 ), 3828 config=config, 3829 ) 3830 for components_mapping_definition_model in model.components_mapping 3831 ] 3832 3833 return HttpComponentsResolver( 3834 retriever=retriever, 3835 config=config, 3836 components_mapping=components_mapping, 3837 parameters=model.parameters or {}, 3838 )
3840 @staticmethod 3841 def create_stream_config( 3842 model: StreamConfigModel, config: Config, **kwargs: Any 3843 ) -> StreamConfig: 3844 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3845 [x for x in model.configs_pointer] if model.configs_pointer else [] 3846 ) 3847 3848 return StreamConfig( 3849 configs_pointer=model_configs_pointer, 3850 default_values=model.default_values, 3851 parameters=model.parameters or {}, 3852 )
3854 def create_config_components_resolver( 3855 self, model: ConfigComponentsResolverModel, config: Config 3856 ) -> Any: 3857 model_stream_configs = ( 3858 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3859 ) 3860 3861 stream_configs = [ 3862 self._create_component_from_model( 3863 stream_config, config=config, parameters=model.parameters or {} 3864 ) 3865 for stream_config in model_stream_configs 3866 ] 3867 3868 components_mapping = [ 3869 self._create_component_from_model( 3870 model=components_mapping_definition_model, 3871 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3872 components_mapping_definition_model.value_type 3873 ), 3874 config=config, 3875 ) 3876 for components_mapping_definition_model in model.components_mapping 3877 ] 3878 3879 return ConfigComponentsResolver( 3880 stream_configs=stream_configs, 3881 config=config, 3882 components_mapping=components_mapping, 3883 parameters=model.parameters or {}, 3884 )
3886 def create_parametrized_components_resolver( 3887 self, model: ParametrizedComponentsResolverModel, config: Config 3888 ) -> ParametrizedComponentsResolver: 3889 stream_parameters = StreamParametersDefinition( 3890 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3891 ) 3892 components_mapping = [ 3893 self._create_component_from_model( 3894 model=components_mapping_definition_model, 3895 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3896 components_mapping_definition_model.value_type 3897 ), 3898 config=config, 3899 ) 3900 for components_mapping_definition_model in model.components_mapping 3901 ] 3902 return ParametrizedComponentsResolver( 3903 stream_parameters=stream_parameters, 3904 config=config, 3905 components_mapping=components_mapping, 3906 parameters=model.parameters or {}, 3907 )
3931 def create_http_api_budget( 3932 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3933 ) -> HttpAPIBudget: 3934 policies = [ 3935 self._create_component_from_model(model=policy, config=config) 3936 for policy in model.policies 3937 ] 3938 3939 return HttpAPIBudget( 3940 policies=policies, 3941 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3942 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3943 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3944 )
3946 def create_fixed_window_call_rate_policy( 3947 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3948 ) -> FixedWindowCallRatePolicy: 3949 matchers = [ 3950 self._create_component_from_model(model=matcher, config=config) 3951 for matcher in model.matchers 3952 ] 3953 3954 # Set the initial reset timestamp to 10 days from now. 3955 # This value will be updated by the first request. 3956 return FixedWindowCallRatePolicy( 3957 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3958 period=parse_duration(model.period), 3959 call_limit=model.call_limit, 3960 matchers=matchers, 3961 )
3963 def create_file_uploader( 3964 self, model: FileUploaderModel, config: Config, **kwargs: Any 3965 ) -> FileUploader: 3966 name = "File Uploader" 3967 requester = self._create_component_from_model( 3968 model=model.requester, 3969 config=config, 3970 name=name, 3971 **kwargs, 3972 ) 3973 download_target_extractor = self._create_component_from_model( 3974 model=model.download_target_extractor, 3975 config=config, 3976 name=name, 3977 **kwargs, 3978 ) 3979 emit_connector_builder_messages = self._emit_connector_builder_messages 3980 file_uploader = DefaultFileUploader( 3981 requester=requester, 3982 download_target_extractor=download_target_extractor, 3983 config=config, 3984 file_writer=NoopFileWriter() 3985 if emit_connector_builder_messages 3986 else LocalFileSystemFileWriter(), 3987 parameters=model.parameters or {}, 3988 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3989 ) 3990 3991 return ( 3992 ConnectorBuilderFileUploader(file_uploader) 3993 if emit_connector_builder_messages 3994 else file_uploader 3995 )
3997 def create_moving_window_call_rate_policy( 3998 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3999 ) -> MovingWindowCallRatePolicy: 4000 rates = [ 4001 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4002 ] 4003 matchers = [ 4004 self._create_component_from_model(model=matcher, config=config) 4005 for matcher in model.matchers 4006 ] 4007 return MovingWindowCallRatePolicy( 4008 rates=rates, 4009 matchers=matchers, 4010 )
4012 def create_unlimited_call_rate_policy( 4013 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4014 ) -> UnlimitedCallRatePolicy: 4015 matchers = [ 4016 self._create_component_from_model(model=matcher, config=config) 4017 for matcher in model.matchers 4018 ] 4019 4020 return UnlimitedCallRatePolicy( 4021 matchers=matchers, 4022 )
4031 def create_http_request_matcher( 4032 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4033 ) -> HttpRequestRegexMatcher: 4034 return HttpRequestRegexMatcher( 4035 method=model.method, 4036 url_base=model.url_base, 4037 url_path_pattern=model.url_path_pattern, 4038 params=model.params, 4039 headers=model.headers, 4040 )
4047 def create_grouping_partition_router( 4048 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4049 ) -> GroupingPartitionRouter: 4050 underlying_router = self._create_component_from_model( 4051 model=model.underlying_partition_router, config=config 4052 ) 4053 if model.group_size < 1: 4054 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4055 4056 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4057 # because they are specific to individual partitions and cannot be aggregated or handled 4058 # when grouping, potentially leading to incorrect API calls. Any request customization 4059 # should be managed at the stream level through the requester's configuration. 4060 if isinstance(underlying_router, SubstreamPartitionRouter): 4061 if any( 4062 parent_config.request_option 4063 for parent_config in underlying_router.parent_stream_configs 4064 ): 4065 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4066 4067 if isinstance(underlying_router, ListPartitionRouter): 4068 if underlying_router.request_option: 4069 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4070 4071 return GroupingPartitionRouter( 4072 group_size=model.group_size, 4073 underlying_partition_router=underlying_router, 4074 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4075 config=config, 4076 )
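# Illustrative sketch (hypothetical manifest values): a grouping router such as
#
#   partition_router:
#     type: GroupingPartitionRouter
#     group_size: 10
#     underlying_partition_router:
#       type: ListPartitionRouter
#       cursor_field: account_id
#       values: ["acct_1", "acct_2", "acct_3"]
#
# batches the underlying partitions ten at a time, deduplicating by default; if the underlying router declares a
# request_option, the ValueError above is raised because per-partition request options cannot be applied to a
# grouped slice.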