airbyte_cdk.sources.declarative.parsers.model_to_component_factory
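The listing below is the module source. As a quick orientation before the code: the factory maps each generated Pydantic model class to a `create_*` constructor and is normally driven through `create_component`. The following is a minimal, hypothetical sketch of how that entry point can be exercised; the manifest snippet and empty config are illustrative only and are not taken from the module itself.

# A minimal sketch, assuming a CheckStream manifest snippet; not an official usage guide.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# "type" must equal the model class name, otherwise create_component raises ValueError.
check_stream = factory.create_component(
    model_type=CheckStreamModel,
    component_definition={"type": "CheckStream", "stream_names": ["users"]},  # hypothetical manifest snippet
    config={},  # connector config normally supplied by the user
)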
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.models import FailureType, Level
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative import transformations
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ChildPartitionResumableFullRefreshCursor,
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
    CursorFactory,
    DatetimeBasedCursor,
    DeclarativeCursor,
    GlobalSubstreamCursor,
    PerPartitionCursor,
    PerPartitionWithGlobalCursor,
    ResumableFullRefreshCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomIncrementalSync as CustomIncrementalSyncModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
    COMPONENTS_MODULE_NAME,
    SDM_COMPONENTS_MODULE_NAME,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    StreamConfig,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
    SimpleRetrieverTestReadDecorator,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        disable_resumable_full_refresh: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._disable_resumable_full_refresh = disable_resumable_full_refresh
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomIncrementalSyncModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_declarative_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
1133 ) 1134 return BearerAuthenticator( 1135 token_provider=( 1136 token_provider 1137 if token_provider is not None 1138 else InterpolatedStringTokenProvider( 1139 api_token=model.api_token or "", 1140 config=config, 1141 parameters=model.parameters or {}, 1142 ) 1143 ), 1144 config=config, 1145 parameters=model.parameters or {}, 1146 ) 1147 1148 @staticmethod 1149 def create_dynamic_stream_check_config( 1150 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1151 ) -> DynamicStreamCheckConfig: 1152 return DynamicStreamCheckConfig( 1153 dynamic_stream_name=model.dynamic_stream_name, 1154 stream_count=model.stream_count or 0, 1155 ) 1156 1157 def create_check_stream( 1158 self, model: CheckStreamModel, config: Config, **kwargs: Any 1159 ) -> CheckStream: 1160 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1161 raise ValueError( 1162 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1163 ) 1164 1165 dynamic_streams_check_configs = ( 1166 [ 1167 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1168 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1169 ] 1170 if model.dynamic_streams_check_configs 1171 else [] 1172 ) 1173 1174 return CheckStream( 1175 stream_names=model.stream_names or [], 1176 dynamic_streams_check_configs=dynamic_streams_check_configs, 1177 parameters={}, 1178 ) 1179 1180 @staticmethod 1181 def create_check_dynamic_stream( 1182 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1183 ) -> CheckDynamicStream: 1184 assert model.use_check_availability is not None # for mypy 1185 1186 use_check_availability = model.use_check_availability 1187 1188 return CheckDynamicStream( 1189 stream_count=model.stream_count, 1190 use_check_availability=use_check_availability, 1191 parameters={}, 1192 ) 1193 1194 def create_composite_error_handler( 1195 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1196 ) -> CompositeErrorHandler: 1197 error_handlers = [ 1198 self._create_component_from_model(model=error_handler_model, config=config) 1199 for error_handler_model in model.error_handlers 1200 ] 1201 return CompositeErrorHandler( 1202 error_handlers=error_handlers, parameters=model.parameters or {} 1203 ) 1204 1205 @staticmethod 1206 def create_concurrency_level( 1207 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1208 ) -> ConcurrencyLevel: 1209 return ConcurrencyLevel( 1210 default_concurrency=model.default_concurrency, 1211 max_concurrency=model.max_concurrency, 1212 config=config, 1213 parameters={}, 1214 ) 1215 1216 @staticmethod 1217 def apply_stream_state_migrations( 1218 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1219 ) -> MutableMapping[str, Any]: 1220 if stream_state_migrations: 1221 for state_migration in stream_state_migrations: 1222 if state_migration.should_migrate(stream_state): 1223 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1224 stream_state = dict(state_migration.migrate(stream_state)) 1225 return stream_state 1226 1227 def create_concurrent_cursor_from_datetime_based_cursor( 1228 self, 1229 model_type: Type[BaseModel], 1230 component_definition: ComponentDefinition, 1231 stream_name: str, 1232 stream_namespace: Optional[str], 1233 config: Config, 1234 message_repository: Optional[MessageRepository] = None, 1235 runtime_lookback_window: Optional[datetime.timedelta] = None, 1236 stream_state_migrations: Optional[List[Any]] = None, 1237 **kwargs: Any, 1238 ) -> ConcurrentCursor: 1239 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1240 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1241 # incoming state and connector_state_manager that is initialized when the component factory is created 1242 stream_state = ( 1243 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1244 if "stream_state" not in kwargs 1245 else kwargs["stream_state"] 1246 ) 1247 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1248 1249 component_type = component_definition.get("type") 1250 if component_definition.get("type") != model_type.__name__: 1251 raise ValueError( 1252 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1253 ) 1254 1255 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1256 1257 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1258 raise ValueError( 1259 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1260 ) 1261 1262 interpolated_cursor_field = InterpolatedString.create( 1263 datetime_based_cursor_model.cursor_field, 1264 parameters=datetime_based_cursor_model.parameters or {}, 1265 ) 1266 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1267 1268 interpolated_partition_field_start = InterpolatedString.create( 1269 datetime_based_cursor_model.partition_field_start or "start_time", 1270 parameters=datetime_based_cursor_model.parameters or {}, 1271 ) 1272 interpolated_partition_field_end = InterpolatedString.create( 1273 datetime_based_cursor_model.partition_field_end or "end_time", 1274 parameters=datetime_based_cursor_model.parameters or {}, 1275 ) 1276 1277 slice_boundary_fields = ( 1278 interpolated_partition_field_start.eval(config=config), 1279 interpolated_partition_field_end.eval(config=config), 1280 ) 1281 1282 datetime_format = datetime_based_cursor_model.datetime_format 1283 1284 cursor_granularity = ( 1285 parse_duration(datetime_based_cursor_model.cursor_granularity) 1286 if datetime_based_cursor_model.cursor_granularity 1287 else None 1288 ) 1289 1290 lookback_window = None 1291 interpolated_lookback_window = ( 1292 InterpolatedString.create( 1293 datetime_based_cursor_model.lookback_window, 1294 parameters=datetime_based_cursor_model.parameters or {}, 1295 ) 1296 if datetime_based_cursor_model.lookback_window 1297 else None 1298 ) 1299 if interpolated_lookback_window: 1300 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1301 if evaluated_lookback_window: 1302 lookback_window = parse_duration(evaluated_lookback_window) 1303 1304 connector_state_converter: DateTimeStreamStateConverter 1305 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1306 datetime_format=datetime_format, 1307 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1308 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1309 cursor_granularity=cursor_granularity, 1310 ) 1311 1312 # Adjusts the stream state by applying the runtime lookback window. 1313 # This is used to ensure correct state handling in case of failed partitions. 1314 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1315 if runtime_lookback_window and stream_state_value: 1316 new_stream_state = ( 1317 connector_state_converter.parse_timestamp(stream_state_value) 1318 - runtime_lookback_window 1319 ) 1320 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1321 new_stream_state 1322 ) 1323 1324 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1325 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1326 start_date_runtime_value = self.create_min_max_datetime( 1327 model=datetime_based_cursor_model.start_datetime, config=config 1328 ) 1329 else: 1330 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1331 1332 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1333 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1334 end_date_runtime_value = self.create_min_max_datetime( 1335 model=datetime_based_cursor_model.end_datetime, config=config 1336 ) 1337 else: 1338 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1339 1340 interpolated_start_date = MinMaxDatetime.create( 1341 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1342 parameters=datetime_based_cursor_model.parameters, 1343 ) 1344 interpolated_end_date = ( 1345 None 1346 if not end_date_runtime_value 1347 else MinMaxDatetime.create( 1348 end_date_runtime_value, datetime_based_cursor_model.parameters 1349 ) 1350 ) 1351 1352 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1353 if not interpolated_start_date.datetime_format: 1354 interpolated_start_date.datetime_format = datetime_format 1355 if interpolated_end_date and not interpolated_end_date.datetime_format: 1356 interpolated_end_date.datetime_format = datetime_format 1357 1358 start_date = interpolated_start_date.get_datetime(config=config) 1359 end_date_provider = ( 1360 partial(interpolated_end_date.get_datetime, config) 1361 if interpolated_end_date 1362 else connector_state_converter.get_end_provider() 1363 ) 1364 1365 if ( 1366 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1367 ) or ( 1368 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1369 ): 1370 raise ValueError( 1371 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1372 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1373 ) 1374 1375 # When step is not defined, default to a step size from the starting date to the present moment 1376 step_length = datetime.timedelta.max 1377 interpolated_step = ( 1378 InterpolatedString.create( 1379 datetime_based_cursor_model.step, 1380 parameters=datetime_based_cursor_model.parameters or {}, 1381 ) 1382 if datetime_based_cursor_model.step 1383 else None 1384 ) 1385 if interpolated_step: 1386 evaluated_step = interpolated_step.eval(config) 1387 if evaluated_step: 1388 step_length = parse_duration(evaluated_step) 1389 1390 clamping_strategy: ClampingStrategy = NoClamping() 1391 if datetime_based_cursor_model.clamping: 1392 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1393 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1394 # object which we want to keep agnostic of being low-code 1395 target = InterpolatedString( 1396 string=datetime_based_cursor_model.clamping.target, 1397 parameters=datetime_based_cursor_model.parameters or {}, 1398 ) 1399 evaluated_target = target.eval(config=config) 1400 match evaluated_target: 1401 case "DAY": 1402 clamping_strategy = DayClampingStrategy() 1403 end_date_provider = ClampingEndProvider( 1404 DayClampingStrategy(is_ceiling=False), 1405 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1406 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1407 ) 1408 case "WEEK": 1409 if ( 1410 not datetime_based_cursor_model.clamping.target_details 1411 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1412 ): 1413 raise ValueError( 1414 "Given WEEK clamping, weekday needs to be provided as target_details" 1415 ) 1416 weekday = self._assemble_weekday( 1417 datetime_based_cursor_model.clamping.target_details["weekday"] 1418 ) 1419 clamping_strategy = WeekClampingStrategy(weekday) 1420 end_date_provider = ClampingEndProvider( 1421 WeekClampingStrategy(weekday, is_ceiling=False), 1422 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1423 granularity=cursor_granularity or datetime.timedelta(days=1), 1424 ) 1425 case "MONTH": 1426 clamping_strategy = MonthClampingStrategy() 1427 end_date_provider = ClampingEndProvider( 1428 MonthClampingStrategy(is_ceiling=False), 1429 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1430 granularity=cursor_granularity or datetime.timedelta(days=1), 1431 ) 1432 case _: 1433 raise ValueError( 1434 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1435 ) 1436 1437 return ConcurrentCursor( 1438 stream_name=stream_name, 1439 stream_namespace=stream_namespace, 1440 stream_state=stream_state, 1441 message_repository=message_repository or self._message_repository, 1442 connector_state_manager=self._connector_state_manager, 1443 connector_state_converter=connector_state_converter, 1444 cursor_field=cursor_field, 1445 slice_boundary_fields=slice_boundary_fields, 1446 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1447 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1448 lookback_window=lookback_window, 1449 slice_range=step_length, 1450 cursor_granularity=cursor_granularity, 1451 clamping_strategy=clamping_strategy, 1452 ) 1453 1454 def create_concurrent_cursor_from_incrementing_count_cursor( 1455 self, 1456 model_type: Type[BaseModel], 1457 component_definition: ComponentDefinition, 1458 stream_name: str, 1459 stream_namespace: Optional[str], 1460 config: Config, 1461 message_repository: Optional[MessageRepository] = None, 1462 **kwargs: Any, 1463 ) -> ConcurrentCursor: 1464 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1465 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1466 # incoming state and connector_state_manager that is initialized when the component factory is created 1467 stream_state = ( 1468 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1469 if "stream_state" not in kwargs 1470 else kwargs["stream_state"] 1471 ) 1472 1473 component_type = component_definition.get("type") 1474 if component_definition.get("type") != model_type.__name__: 1475 raise ValueError( 1476 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1477 ) 1478 1479 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1480 1481 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1482 raise ValueError( 1483 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1484 ) 1485 1486 interpolated_start_value = ( 1487 InterpolatedString.create( 1488 incrementing_count_cursor_model.start_value, # type: ignore 1489 parameters=incrementing_count_cursor_model.parameters or {}, 1490 ) 1491 if incrementing_count_cursor_model.start_value 1492 else 0 1493 ) 1494 1495 interpolated_cursor_field = InterpolatedString.create( 1496 incrementing_count_cursor_model.cursor_field, 1497 parameters=incrementing_count_cursor_model.parameters or {}, 1498 ) 1499 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1500 1501 connector_state_converter = IncrementingCountStreamStateConverter( 1502 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1503 ) 1504 1505 return ConcurrentCursor( 1506 stream_name=stream_name, 1507 stream_namespace=stream_namespace, 1508 stream_state=stream_state, 1509 message_repository=message_repository or self._message_repository, 
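# Count-based cursors have no time window to slice on, so no slice boundary fields or cursor granularity are set below; the state converter's default end provider supplies the upper bound.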
1510 connector_state_manager=self._connector_state_manager, 1511 connector_state_converter=connector_state_converter, 1512 cursor_field=cursor_field, 1513 slice_boundary_fields=None, 1514 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1515 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 ) 1517 1518 def _assemble_weekday(self, weekday: str) -> Weekday: 1519 match weekday: 1520 case "MONDAY": 1521 return Weekday.MONDAY 1522 case "TUESDAY": 1523 return Weekday.TUESDAY 1524 case "WEDNESDAY": 1525 return Weekday.WEDNESDAY 1526 case "THURSDAY": 1527 return Weekday.THURSDAY 1528 case "FRIDAY": 1529 return Weekday.FRIDAY 1530 case "SATURDAY": 1531 return Weekday.SATURDAY 1532 case "SUNDAY": 1533 return Weekday.SUNDAY 1534 case _: 1535 raise ValueError(f"Unknown weekday {weekday}") 1536 1537 def create_concurrent_cursor_from_perpartition_cursor( 1538 self, 1539 state_manager: ConnectorStateManager, 1540 model_type: Type[BaseModel], 1541 component_definition: ComponentDefinition, 1542 stream_name: str, 1543 stream_namespace: Optional[str], 1544 config: Config, 1545 stream_state: MutableMapping[str, Any], 1546 partition_router: PartitionRouter, 1547 stream_state_migrations: Optional[List[Any]] = None, 1548 **kwargs: Any, 1549 ) -> ConcurrentPerPartitionCursor: 1550 component_type = component_definition.get("type") 1551 if component_definition.get("type") != model_type.__name__: 1552 raise ValueError( 1553 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1554 ) 1555 1556 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1557 1558 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1559 raise ValueError( 1560 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1561 ) 1562 1563 interpolated_cursor_field = InterpolatedString.create( 1564 datetime_based_cursor_model.cursor_field, 1565 parameters=datetime_based_cursor_model.parameters or {}, 1566 ) 1567 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1568 1569 datetime_format = datetime_based_cursor_model.datetime_format 1570 1571 cursor_granularity = ( 1572 parse_duration(datetime_based_cursor_model.cursor_granularity) 1573 if datetime_based_cursor_model.cursor_granularity 1574 else None 1575 ) 1576 1577 connector_state_converter: DateTimeStreamStateConverter 1578 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1579 datetime_format=datetime_format, 1580 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1581 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1582 cursor_granularity=cursor_granularity, 1583 ) 1584 1585 # Create the cursor factory 1586 cursor_factory = ConcurrentCursorFactory( 1587 partial( 1588 self.create_concurrent_cursor_from_datetime_based_cursor, 1589 state_manager=state_manager, 1590 model_type=model_type, 1591 component_definition=component_definition, 1592 stream_name=stream_name, 1593 stream_namespace=stream_namespace, 1594 config=config, 1595 message_repository=NoopMessageRepository(), 1596 stream_state_migrations=stream_state_migrations, 1597 ) 1598 ) 1599 1600 
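# Apply any configured state migrations to the incoming (possibly legacy-shaped) state before it is handed to the per-partition cursor; the child cursors built by the factory above receive their slice-level state through the stream_state kwarg and run the same migrations there.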
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1601 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1602 use_global_cursor = isinstance( 1603 partition_router, GroupingPartitionRouter 1604 ) or component_definition.get("global_substream_cursor", False) 1605 1606 # Return the concurrent cursor and state converter 1607 return ConcurrentPerPartitionCursor( 1608 cursor_factory=cursor_factory, 1609 partition_router=partition_router, 1610 stream_name=stream_name, 1611 stream_namespace=stream_namespace, 1612 stream_state=stream_state, 1613 message_repository=self._message_repository, # type: ignore 1614 connector_state_manager=state_manager, 1615 connector_state_converter=connector_state_converter, 1616 cursor_field=cursor_field, 1617 use_global_cursor=use_global_cursor, 1618 ) 1619 1620 @staticmethod 1621 def create_constant_backoff_strategy( 1622 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1623 ) -> ConstantBackoffStrategy: 1624 return ConstantBackoffStrategy( 1625 backoff_time_in_seconds=model.backoff_time_in_seconds, 1626 config=config, 1627 parameters=model.parameters or {}, 1628 ) 1629 1630 def create_cursor_pagination( 1631 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1632 ) -> CursorPaginationStrategy: 1633 if isinstance(decoder, PaginationDecoderDecorator): 1634 inner_decoder = decoder.decoder 1635 else: 1636 inner_decoder = decoder 1637 decoder = PaginationDecoderDecorator(decoder=decoder) 1638 1639 if self._is_supported_decoder_for_pagination(inner_decoder): 1640 decoder_to_use = decoder 1641 else: 1642 raise ValueError( 1643 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1644 ) 1645 1646 return CursorPaginationStrategy( 1647 cursor_value=model.cursor_value, 1648 decoder=decoder_to_use, 1649 page_size=model.page_size, 1650 stop_condition=model.stop_condition, 1651 config=config, 1652 parameters=model.parameters or {}, 1653 ) 1654 1655 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1656 """ 1657 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1658 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1659 :param model: The Pydantic model of the custom component being created 1660 :param config: The custom defined connector config 1661 :return: The declarative component built from the Pydantic model to be used at runtime 1662 """ 1663 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1664 component_fields = get_type_hints(custom_component_class) 1665 model_args = model.dict() 1666 model_args["config"] = config 1667 1668 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1669 # we defer to these arguments over the component's definition 1670 for key, arg in kwargs.items(): 1671 model_args[key] = arg 1672 1673 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1674 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1675 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1676 for model_field, model_value in model_args.items(): 1677 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1678 if ( 1679 isinstance(model_value, dict) 1680 and "type" not in model_value 1681 and model_field in component_fields 1682 ): 1683 derived_type = self._derive_component_type_from_type_hints( 1684 component_fields.get(model_field) 1685 ) 1686 if derived_type: 1687 model_value["type"] = derived_type 1688 1689 if self._is_component(model_value): 1690 model_args[model_field] = self._create_nested_component( 1691 model, model_field, model_value, config 1692 ) 1693 elif isinstance(model_value, list): 1694 vals = [] 1695 for v in model_value: 1696 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1697 derived_type = self._derive_component_type_from_type_hints( 1698 component_fields.get(model_field) 1699 ) 1700 if derived_type: 1701 v["type"] = derived_type 1702 if self._is_component(v): 1703 vals.append(self._create_nested_component(model, model_field, v, config)) 1704 else: 1705 vals.append(v) 1706 model_args[model_field] = vals 1707 1708 kwargs = { 1709 class_field: model_args[class_field] 1710 for class_field in component_fields.keys() 1711 if class_field in model_args 1712 } 1713 return custom_component_class(**kwargs) 1714 1715 @staticmethod 1716 def _get_class_from_fully_qualified_class_name( 1717 full_qualified_class_name: str, 1718 ) -> Any: 1719 """Get a class from its fully qualified name. 1720 1721 If a custom components module is needed, we assume it is already registered - probably 1722 as `source_declarative_manifest.components` or `components`. 1723 1724 Args: 1725 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1726 1727 Returns: 1728 Any: The class object. 1729 1730 Raises: 1731 ValueError: If the class cannot be loaded. 1732 """ 1733 split = full_qualified_class_name.split(".") 1734 module_name_full = ".".join(split[:-1]) 1735 class_name = split[-1] 1736 1737 try: 1738 module_ref = importlib.import_module(module_name_full) 1739 except ModuleNotFoundError as e: 1740 if split[0] == "source_declarative_manifest": 1741 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1742 try: 1743 import os 1744 1745 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1746 module_ref = importlib.import_module( 1747 module_name_with_source_declarative_manifest 1748 ) 1749 except ModuleNotFoundError: 1750 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1751 else: 1752 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1753 1754 try: 1755 return getattr(module_ref, class_name) 1756 except AttributeError as e: 1757 raise ValueError( 1758 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1759 ) from e 1760 1761 @staticmethod 1762 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1763 interface = field_type 1764 while True: 1765 origin = get_origin(interface) 1766 if origin: 1767 # Unnest types until we reach the raw type 1768 # List[T] -> T 1769 # Optional[List[T]] -> T 1770 args = get_args(interface) 1771 interface = args[0] 1772 else: 1773 break 1774 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1775 return interface.__name__ 1776 return None 1777 1778 @staticmethod 1779 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1780 if not cls: 1781 return False 1782 return cls.__module__ == "builtins" 1783 1784 @staticmethod 1785 def _extract_missing_parameters(error: TypeError) -> List[str]: 1786 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1787 if parameter_search: 1788 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1789 else: 1790 return [] 1791 1792 def _create_nested_component( 1793 self, model: Any, model_field: str, model_value: Any, config: Config 1794 ) -> Any: 1795 type_name = model_value.get("type", None) 1796 if not type_name: 1797 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1798 return model_value 1799 1800 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1801 if model_type: 1802 parsed_model = model_type.parse_obj(model_value) 1803 try: 1804 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1805 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1806 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1807 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1808 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1809 # are needed by a component and could not be shared. 
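# Look up the registered constructor for the parsed model, inspect its keyword-only arguments, and forward only the matching entries from the component's $parameters.
# For example (illustrative only), a manifest could set `$parameters: {name: "my_stream"}` on the nested component to satisfy a constructor's required keyword-only `name` argument.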
1810 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1811 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1812 model_parameters = model_value.get("$parameters", {}) 1813 matching_parameters = { 1814 kwarg: model_parameters[kwarg] 1815 for kwarg in constructor_kwargs 1816 if kwarg in model_parameters 1817 } 1818 return self._create_component_from_model( 1819 model=parsed_model, config=config, **matching_parameters 1820 ) 1821 except TypeError as error: 1822 missing_parameters = self._extract_missing_parameters(error) 1823 if missing_parameters: 1824 raise ValueError( 1825 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1826 + ", ".join( 1827 ( 1828 f"{type_name}.$parameters.{parameter}" 1829 for parameter in missing_parameters 1830 ) 1831 ) 1832 ) 1833 raise TypeError( 1834 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1835 ) 1836 else: 1837 raise ValueError( 1838 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1839 ) 1840 1841 @staticmethod 1842 def _is_component(model_value: Any) -> bool: 1843 return isinstance(model_value, dict) and model_value.get("type") is not None 1844 1845 def create_datetime_based_cursor( 1846 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1847 ) -> DatetimeBasedCursor: 1848 start_datetime: Union[str, MinMaxDatetime] = ( 1849 model.start_datetime 1850 if isinstance(model.start_datetime, str) 1851 else self.create_min_max_datetime(model.start_datetime, config) 1852 ) 1853 end_datetime: Union[str, MinMaxDatetime, None] = None 1854 if model.is_data_feed and model.end_datetime: 1855 raise ValueError("Data feed does not support end_datetime") 1856 if model.is_data_feed and model.is_client_side_incremental: 1857 raise ValueError( 1858 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
1859 ) 1860 if model.end_datetime: 1861 end_datetime = ( 1862 model.end_datetime 1863 if isinstance(model.end_datetime, str) 1864 else self.create_min_max_datetime(model.end_datetime, config) 1865 ) 1866 1867 end_time_option = ( 1868 self._create_component_from_model( 1869 model.end_time_option, config, parameters=model.parameters or {} 1870 ) 1871 if model.end_time_option 1872 else None 1873 ) 1874 start_time_option = ( 1875 self._create_component_from_model( 1876 model.start_time_option, config, parameters=model.parameters or {} 1877 ) 1878 if model.start_time_option 1879 else None 1880 ) 1881 1882 return DatetimeBasedCursor( 1883 cursor_field=model.cursor_field, 1884 cursor_datetime_formats=model.cursor_datetime_formats 1885 if model.cursor_datetime_formats 1886 else [], 1887 cursor_granularity=model.cursor_granularity, 1888 datetime_format=model.datetime_format, 1889 end_datetime=end_datetime, 1890 start_datetime=start_datetime, 1891 step=model.step, 1892 end_time_option=end_time_option, 1893 lookback_window=model.lookback_window, 1894 start_time_option=start_time_option, 1895 partition_field_end=model.partition_field_end, 1896 partition_field_start=model.partition_field_start, 1897 message_repository=self._message_repository, 1898 is_compare_strictly=model.is_compare_strictly, 1899 config=config, 1900 parameters=model.parameters or {}, 1901 ) 1902 1903 def create_declarative_stream( 1904 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1905 ) -> DeclarativeStream: 1906 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1907 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1908 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1909 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1910 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1911 1912 primary_key = model.primary_key.__root__ if model.primary_key else None 1913 stop_condition_on_cursor = ( 1914 model.incremental_sync 1915 and hasattr(model.incremental_sync, "is_data_feed") 1916 and model.incremental_sync.is_data_feed 1917 ) 1918 client_side_incremental_sync = None 1919 if ( 1920 model.incremental_sync 1921 and hasattr(model.incremental_sync, "is_client_side_incremental") 1922 and model.incremental_sync.is_client_side_incremental 1923 ): 1924 supported_slicers = ( 1925 DatetimeBasedCursor, 1926 GlobalSubstreamCursor, 1927 PerPartitionWithGlobalCursor, 1928 ) 1929 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1930 raise ValueError( 1931 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1932 ) 1933 cursor = ( 1934 combined_slicers 1935 if isinstance( 1936 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1937 ) 1938 else self._create_component_from_model(model=model.incremental_sync, config=config) 1939 ) 1940 1941 client_side_incremental_sync = {"cursor": cursor} 1942 1943 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1944 cursor_model = model.incremental_sync 1945 1946 end_time_option = ( 1947 self._create_component_from_model( 1948 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1949 ) 1950 if cursor_model.end_time_option 1951 else None 1952 ) 1953 start_time_option = ( 1954 self._create_component_from_model( 1955 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1956 ) 1957 if cursor_model.start_time_option 1958 else None 1959 ) 1960 1961 request_options_provider = DatetimeBasedRequestOptionsProvider( 1962 start_time_option=start_time_option, 1963 end_time_option=end_time_option, 1964 partition_field_start=cursor_model.partition_field_start, 1965 partition_field_end=cursor_model.partition_field_end, 1966 config=config, 1967 parameters=model.parameters or {}, 1968 ) 1969 elif model.incremental_sync and isinstance( 1970 model.incremental_sync, IncrementingCountCursorModel 1971 ): 1972 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1973 1974 start_time_option = ( 1975 self._create_component_from_model( 1976 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1977 config, 1978 parameters=cursor_model.parameters or {}, 1979 ) 1980 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1981 else None 1982 ) 1983 1984 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1985 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1986 partition_field_start = "start" 1987 1988 request_options_provider = DatetimeBasedRequestOptionsProvider( 1989 start_time_option=start_time_option, 1990 partition_field_start=partition_field_start, 1991 config=config, 1992 parameters=model.parameters or {}, 1993 ) 1994 else: 1995 request_options_provider = None 1996 1997 transformations = [] 1998 if model.transformations: 1999 for transformation_model in model.transformations: 2000 transformations.append( 2001 self._create_component_from_model(model=transformation_model, config=config) 2002 ) 2003 file_uploader = None 2004 if model.file_uploader: 2005 file_uploader = self._create_component_from_model( 2006 model=model.file_uploader, config=config 2007 ) 2008 2009 retriever = self._create_component_from_model( 2010 model=model.retriever, 2011 config=config, 2012 name=model.name, 2013 primary_key=primary_key, 2014 stream_slicer=combined_slicers, 2015 request_options_provider=request_options_provider, 2016 stop_condition_on_cursor=stop_condition_on_cursor, 2017 client_side_incremental_sync=client_side_incremental_sync, 2018 transformations=transformations, 2019 file_uploader=file_uploader, 2020 incremental_sync=model.incremental_sync, 2021 ) 2022 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2023 2024 if model.state_migrations: 2025 state_transformations = [ 2026 self._create_component_from_model(state_migration, config, declarative_stream=model) 2027 for
state_migration in model.state_migrations 2028 ] 2029 else: 2030 state_transformations = [] 2031 2032 schema_loader: Union[ 2033 CompositeSchemaLoader, 2034 DefaultSchemaLoader, 2035 DynamicSchemaLoader, 2036 InlineSchemaLoader, 2037 JsonFileSchemaLoader, 2038 ] 2039 if model.schema_loader and isinstance(model.schema_loader, list): 2040 nested_schema_loaders = [ 2041 self._create_component_from_model(model=nested_schema_loader, config=config) 2042 for nested_schema_loader in model.schema_loader 2043 ] 2044 schema_loader = CompositeSchemaLoader( 2045 schema_loaders=nested_schema_loaders, parameters={} 2046 ) 2047 elif model.schema_loader: 2048 schema_loader = self._create_component_from_model( 2049 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2050 config=config, 2051 ) 2052 else: 2053 options = model.parameters or {} 2054 if "name" not in options: 2055 options["name"] = model.name 2056 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2057 2058 return DeclarativeStream( 2059 name=model.name or "", 2060 primary_key=primary_key, 2061 retriever=retriever, 2062 schema_loader=schema_loader, 2063 stream_cursor_field=cursor_field or "", 2064 state_migrations=state_transformations, 2065 config=config, 2066 parameters=model.parameters or {}, 2067 ) 2068 2069 def _build_stream_slicer_from_partition_router( 2070 self, 2071 model: Union[ 2072 AsyncRetrieverModel, 2073 CustomRetrieverModel, 2074 SimpleRetrieverModel, 2075 ], 2076 config: Config, 2077 stream_name: Optional[str] = None, 2078 ) -> Optional[PartitionRouter]: 2079 if ( 2080 hasattr(model, "partition_router") 2081 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2082 and model.partition_router 2083 ): 2084 stream_slicer_model = model.partition_router 2085 if isinstance(stream_slicer_model, list): 2086 return CartesianProductStreamSlicer( 2087 [ 2088 self._create_component_from_model( 2089 model=slicer, config=config, stream_name=stream_name or "" 2090 ) 2091 for slicer in stream_slicer_model 2092 ], 2093 parameters={}, 2094 ) 2095 else: 2096 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2097 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2098 ) 2099 return None 2100 2101 def _build_incremental_cursor( 2102 self, 2103 model: DeclarativeStreamModel, 2104 stream_slicer: Optional[PartitionRouter], 2105 config: Config, 2106 ) -> Optional[StreamSlicer]: 2107 if model.incremental_sync and stream_slicer: 2108 if model.retriever.type == "AsyncRetriever": 2109 stream_name = model.name or "" 2110 stream_namespace = None 2111 stream_state = self._connector_state_manager.get_stream_state( 2112 stream_name, stream_namespace 2113 ) 2114 state_transformations = ( 2115 [ 2116 self._create_component_from_model( 2117 state_migration, config, declarative_stream=model 2118 ) 2119 for state_migration in model.state_migrations 2120 ] 2121 if model.state_migrations 2122 else [] 2123 ) 2124 2125 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2126 state_manager=self._connector_state_manager, 2127 model_type=DatetimeBasedCursorModel, 2128 component_definition=model.incremental_sync.__dict__, 2129 stream_name=stream_name, 2130 stream_namespace=stream_namespace, 2131 config=config or {}, 2132 stream_state=stream_state, 2133 stream_state_migrations=state_transformations, 2134 partition_router=stream_slicer, 2135 ) 2136 2137 incremental_sync_model = model.incremental_sync 2138 cursor_component = self._create_component_from_model( 2139 model=incremental_sync_model, config=config 2140 ) 2141 is_global_cursor = ( 2142 hasattr(incremental_sync_model, "global_substream_cursor") 2143 and incremental_sync_model.global_substream_cursor 2144 ) 2145 2146 if is_global_cursor: 2147 return GlobalSubstreamCursor( 2148 stream_cursor=cursor_component, partition_router=stream_slicer 2149 ) 2150 return PerPartitionWithGlobalCursor( 2151 cursor_factory=CursorFactory( 2152 lambda: self._create_component_from_model( 2153 model=incremental_sync_model, config=config 2154 ), 2155 ), 2156 partition_router=stream_slicer, 2157 stream_cursor=cursor_component, 2158 ) 2159 elif model.incremental_sync: 2160 if model.retriever.type == "AsyncRetriever": 2161 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2162 model_type=DatetimeBasedCursorModel, 2163 component_definition=model.incremental_sync.__dict__, 2164 stream_name=model.name or "", 2165 stream_namespace=None, 2166 config=config or {}, 2167 stream_state_migrations=model.state_migrations, 2168 ) 2169 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2170 return None 2171 2172 def _build_resumable_cursor( 2173 self, 2174 model: Union[ 2175 AsyncRetrieverModel, 2176 CustomRetrieverModel, 2177 SimpleRetrieverModel, 2178 ], 2179 stream_slicer: Optional[PartitionRouter], 2180 ) -> Optional[StreamSlicer]: 2181 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2182 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2183 return ResumableFullRefreshCursor(parameters={}) 2184 elif stream_slicer: 2185 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2186 return PerPartitionCursor( 2187 cursor_factory=CursorFactory( 2188 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2189 ), 2190 partition_router=stream_slicer, 2191 ) 2192 return None 2193 2194 def _merge_stream_slicers( 2195 self, model: DeclarativeStreamModel, config: Config 2196 ) -> Optional[StreamSlicer]: 2197 retriever_model = model.retriever 2198 2199 stream_slicer = self._build_stream_slicer_from_partition_router( 2200 retriever_model, config, stream_name=model.name 2201 ) 2202 2203 if retriever_model.type == "AsyncRetriever": 2204 is_not_datetime_cursor = ( 2205 model.incremental_sync.type != "DatetimeBasedCursor" 2206 if model.incremental_sync 2207 else None 2208 ) 2209 is_partition_router = ( 2210 
bool(retriever_model.partition_router) if model.incremental_sync else None 2211 ) 2212 2213 if is_not_datetime_cursor: 2214 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2215 # support or unordered slices (for example, when we trigger reports for January and February, the report 2216 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2217 # implementation available in the CDK, we can enable more cursors here. 2218 raise ValueError( 2219 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2220 ) 2221 2222 if is_partition_router and not stream_slicer: 2223 # Note that this development is also done in parallel to the per partition development which once merged 2224 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2225 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2226 2227 if model.incremental_sync: 2228 return self._build_incremental_cursor(model, stream_slicer, config) 2229 2230 return ( 2231 stream_slicer 2232 if self._disable_resumable_full_refresh 2233 else self._build_resumable_cursor(retriever_model, stream_slicer) 2234 ) 2235 2236 def create_default_error_handler( 2237 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2238 ) -> DefaultErrorHandler: 2239 backoff_strategies = [] 2240 if model.backoff_strategies: 2241 for backoff_strategy_model in model.backoff_strategies: 2242 backoff_strategies.append( 2243 self._create_component_from_model(model=backoff_strategy_model, config=config) 2244 ) 2245 2246 response_filters = [] 2247 if model.response_filters: 2248 for response_filter_model in model.response_filters: 2249 response_filters.append( 2250 self._create_component_from_model(model=response_filter_model, config=config) 2251 ) 2252 response_filters.append( 2253 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2254 ) 2255 2256 return DefaultErrorHandler( 2257 backoff_strategies=backoff_strategies, 2258 max_retries=model.max_retries, 2259 response_filters=response_filters, 2260 config=config, 2261 parameters=model.parameters or {}, 2262 ) 2263 2264 def create_default_paginator( 2265 self, 2266 model: DefaultPaginatorModel, 2267 config: Config, 2268 *, 2269 url_base: str, 2270 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2271 decoder: Optional[Decoder] = None, 2272 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2273 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2274 if decoder: 2275 if self._is_supported_decoder_for_pagination(decoder): 2276 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2277 else: 2278 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2279 else: 2280 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2281 page_size_option = ( 2282 self._create_component_from_model(model=model.page_size_option, config=config) 2283 if model.page_size_option 2284 else None 2285 ) 2286 page_token_option = ( 2287 self._create_component_from_model(model=model.page_token_option, config=config) 2288 if model.page_token_option 2289 else None 2290 ) 2291 pagination_strategy = self._create_component_from_model( 2292 model=model.pagination_strategy, 2293 config=config, 2294 decoder=decoder_to_use, 2295 extractor_model=extractor_model, 2296 ) 2297 if cursor_used_for_stop_condition: 2298 
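# Data-feed style streams paginate until the cursor reports that returned records no longer need to be synced; wrapping the strategy here stops pagination at that point instead of exhausting every page.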
pagination_strategy = StopConditionPaginationStrategyDecorator( 2299 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2300 ) 2301 paginator = DefaultPaginator( 2302 decoder=decoder_to_use, 2303 page_size_option=page_size_option, 2304 page_token_option=page_token_option, 2305 pagination_strategy=pagination_strategy, 2306 url_base=url_base, 2307 config=config, 2308 parameters=model.parameters or {}, 2309 ) 2310 if self._limit_pages_fetched_per_slice: 2311 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2312 return paginator 2313 2314 def create_dpath_extractor( 2315 self, 2316 model: DpathExtractorModel, 2317 config: Config, 2318 decoder: Optional[Decoder] = None, 2319 **kwargs: Any, 2320 ) -> DpathExtractor: 2321 if decoder: 2322 decoder_to_use = decoder 2323 else: 2324 decoder_to_use = JsonDecoder(parameters={}) 2325 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2326 return DpathExtractor( 2327 decoder=decoder_to_use, 2328 field_path=model_field_path, 2329 config=config, 2330 parameters=model.parameters or {}, 2331 ) 2332 2333 @staticmethod 2334 def create_response_to_file_extractor( 2335 model: ResponseToFileExtractorModel, 2336 **kwargs: Any, 2337 ) -> ResponseToFileExtractor: 2338 return ResponseToFileExtractor(parameters=model.parameters or {}) 2339 2340 @staticmethod 2341 def create_exponential_backoff_strategy( 2342 model: ExponentialBackoffStrategyModel, config: Config 2343 ) -> ExponentialBackoffStrategy: 2344 return ExponentialBackoffStrategy( 2345 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2346 ) 2347 2348 @staticmethod 2349 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2350 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2351 2352 def create_http_requester( 2353 self, 2354 model: HttpRequesterModel, 2355 config: Config, 2356 decoder: Decoder = JsonDecoder(parameters={}), 2357 query_properties_key: Optional[str] = None, 2358 use_cache: Optional[bool] = None, 2359 *, 2360 name: str, 2361 ) -> HttpRequester: 2362 authenticator = ( 2363 self._create_component_from_model( 2364 model=model.authenticator, 2365 config=config, 2366 url_base=model.url or model.url_base, 2367 name=name, 2368 decoder=decoder, 2369 ) 2370 if model.authenticator 2371 else None 2372 ) 2373 error_handler = ( 2374 self._create_component_from_model(model=model.error_handler, config=config) 2375 if model.error_handler 2376 else DefaultErrorHandler( 2377 backoff_strategies=[], 2378 response_filters=[], 2379 config=config, 2380 parameters=model.parameters or {}, 2381 ) 2382 ) 2383 2384 api_budget = self._api_budget 2385 2386 # Removes QueryProperties components from the interpolated mappings because it has been designed 2387 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2388 # instead of through jinja interpolation 2389 request_parameters: Optional[Union[str, Mapping[str, str]]] 2390 if isinstance(model.request_parameters, Mapping): 2391 request_parameters = self._remove_query_properties(model.request_parameters) 2392 else: 2393 request_parameters = model.request_parameters 2394 2395 request_options_provider = InterpolatedRequestOptionsProvider( 2396 request_body=model.request_body, 2397 request_body_data=model.request_body_data, 2398 request_body_json=model.request_body_json, 2399 request_headers=model.request_headers, 2400 
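# request_parameters has already had any QueryProperties entries stripped out above; those are resolved per slice by the retriever rather than interpolated here.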
request_parameters=request_parameters, 2401 query_properties_key=query_properties_key, 2402 config=config, 2403 parameters=model.parameters or {}, 2404 ) 2405 2406 assert model.use_cache is not None # for mypy 2407 assert model.http_method is not None # for mypy 2408 2409 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2410 2411 return HttpRequester( 2412 name=name, 2413 url=model.url, 2414 url_base=model.url_base, 2415 path=model.path, 2416 authenticator=authenticator, 2417 error_handler=error_handler, 2418 api_budget=api_budget, 2419 http_method=HttpMethod[model.http_method.value], 2420 request_options_provider=request_options_provider, 2421 config=config, 2422 disable_retries=self._disable_retries, 2423 parameters=model.parameters or {}, 2424 message_repository=self._message_repository, 2425 use_cache=should_use_cache, 2426 decoder=decoder, 2427 stream_response=decoder.is_stream_response() if decoder else False, 2428 ) 2429 2430 @staticmethod 2431 def create_http_response_filter( 2432 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2433 ) -> HttpResponseFilter: 2434 if model.action: 2435 action = ResponseAction(model.action.value) 2436 else: 2437 action = None 2438 2439 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2440 2441 http_codes = ( 2442 set(model.http_codes) if model.http_codes else set() 2443 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2444 2445 return HttpResponseFilter( 2446 action=action, 2447 failure_type=failure_type, 2448 error_message=model.error_message or "", 2449 error_message_contains=model.error_message_contains or "", 2450 http_codes=http_codes, 2451 predicate=model.predicate or "", 2452 config=config, 2453 parameters=model.parameters or {}, 2454 ) 2455 2456 @staticmethod 2457 def create_inline_schema_loader( 2458 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2459 ) -> InlineSchemaLoader: 2460 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2461 2462 def create_complex_field_type( 2463 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2464 ) -> ComplexFieldType: 2465 items = ( 2466 self._create_component_from_model(model=model.items, config=config) 2467 if isinstance(model.items, ComplexFieldTypeModel) 2468 else model.items 2469 ) 2470 2471 return ComplexFieldType(field_type=model.field_type, items=items) 2472 2473 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2474 target_type = ( 2475 self._create_component_from_model(model=model.target_type, config=config) 2476 if isinstance(model.target_type, ComplexFieldTypeModel) 2477 else model.target_type 2478 ) 2479 2480 return TypesMap( 2481 target_type=target_type, 2482 current_type=model.current_type, 2483 condition=model.condition if model.condition is not None else "True", 2484 ) 2485 2486 def create_schema_type_identifier( 2487 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2488 ) -> SchemaTypeIdentifier: 2489 types_mapping = [] 2490 if model.types_mapping: 2491 types_mapping.extend( 2492 [ 2493 self._create_component_from_model(types_map, config=config) 2494 for types_map in model.types_mapping 2495 ] 2496 ) 2497 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2498 [x for x in model.schema_pointer] if model.schema_pointer else [] 2499 ) 2500 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2501 
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2502 [x for x in model.type_pointer] if model.type_pointer else None 2503 ) 2504 2505 return SchemaTypeIdentifier( 2506 schema_pointer=model_schema_pointer, 2507 key_pointer=model_key_pointer, 2508 type_pointer=model_type_pointer, 2509 types_mapping=types_mapping, 2510 parameters=model.parameters or {}, 2511 ) 2512 2513 def create_dynamic_schema_loader( 2514 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2515 ) -> DynamicSchemaLoader: 2516 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2517 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2518 2519 schema_transformations = [] 2520 if model.schema_transformations: 2521 for transformation_model in model.schema_transformations: 2522 schema_transformations.append( 2523 self._create_component_from_model(model=transformation_model, config=config) 2524 ) 2525 name = "dynamic_properties" 2526 retriever = self._create_component_from_model( 2527 model=model.retriever, 2528 config=config, 2529 name=name, 2530 primary_key=None, 2531 stream_slicer=combined_slicers, 2532 transformations=[], 2533 use_cache=True, 2534 log_formatter=( 2535 lambda response: format_http_message( 2536 response, 2537 f"Schema loader '{name}' request", 2538 f"Request performed in order to extract schema.", 2539 name, 2540 is_auxiliary=True, 2541 ) 2542 ), 2543 ) 2544 schema_type_identifier = self._create_component_from_model( 2545 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2546 ) 2547 schema_filter = ( 2548 self._create_component_from_model( 2549 model.schema_filter, config=config, parameters=model.parameters or {} 2550 ) 2551 if model.schema_filter is not None 2552 else None 2553 ) 2554 2555 return DynamicSchemaLoader( 2556 retriever=retriever, 2557 config=config, 2558 schema_transformations=schema_transformations, 2559 schema_filter=schema_filter, 2560 schema_type_identifier=schema_type_identifier, 2561 parameters=model.parameters or {}, 2562 ) 2563 2564 @staticmethod 2565 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2566 return JsonDecoder(parameters={}) 2567 2568 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2569 return CompositeRawDecoder( 2570 parser=ModelToComponentFactory._get_parser(model, config), 2571 stream_response=False if self._emit_connector_builder_messages else True, 2572 ) 2573 2574 def create_jsonl_decoder( 2575 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2576 ) -> Decoder: 2577 return CompositeRawDecoder( 2578 parser=ModelToComponentFactory._get_parser(model, config), 2579 stream_response=False if self._emit_connector_builder_messages else True, 2580 ) 2581 2582 def create_gzip_decoder( 2583 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2584 ) -> Decoder: 2585 _compressed_response_types = { 2586 "gzip", 2587 "x-gzip", 2588 "gzip, deflate", 2589 "x-gzip, deflate", 2590 "application/zip", 2591 "application/gzip", 2592 "application/x-gzip", 2593 "application/x-zip-compressed", 2594 } 2595 2596 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2597 2598 if self._emit_connector_builder_messages: 2599 # This is very surprising but if the response is not streamed, 2600 # CompositeRawDecoder calls response.content and the requests library 
actually uncompress the data as opposed to response.raw, 2601 # which uses urllib3 directly and does not uncompress the data. 2602 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2603 2604 return CompositeRawDecoder.by_headers( 2605 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2606 stream_response=True, 2607 fallback_parser=gzip_parser.inner_parser, 2608 ) 2609 2610 @staticmethod 2611 def create_incrementing_count_cursor( 2612 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2613 ) -> DatetimeBasedCursor: 2614 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2615 # we still parse models into components. The issue is that there's no runtime implementation of a 2616 # IncrementingCountCursor. 2617 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2618 return DatetimeBasedCursor( 2619 cursor_field=model.cursor_field, 2620 datetime_format="%Y-%m-%d", 2621 start_datetime="2024-12-12", 2622 config=config, 2623 parameters={}, 2624 ) 2625 2626 @staticmethod 2627 def create_iterable_decoder( 2628 model: IterableDecoderModel, config: Config, **kwargs: Any 2629 ) -> IterableDecoder: 2630 return IterableDecoder(parameters={}) 2631 2632 @staticmethod 2633 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2634 return XmlDecoder(parameters={}) 2635 2636 def create_zipfile_decoder( 2637 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2638 ) -> ZipfileDecoder: 2639 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2640 2641 @staticmethod 2642 def _get_parser(model: BaseModel, config: Config) -> Parser: 2643 if isinstance(model, JsonDecoderModel): 2644 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2645 return JsonParser() 2646 elif isinstance(model, JsonlDecoderModel): 2647 return JsonLineParser() 2648 elif isinstance(model, CsvDecoderModel): 2649 return CsvParser(encoding=model.encoding, delimiter=model.delimiter) 2650 elif isinstance(model, GzipDecoderModel): 2651 return GzipParser( 2652 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2653 ) 2654 elif isinstance( 2655 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2656 ): 2657 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2658 2659 raise ValueError(f"Unknown decoder type {model}") 2660 2661 @staticmethod 2662 def create_json_file_schema_loader( 2663 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2664 ) -> JsonFileSchemaLoader: 2665 return JsonFileSchemaLoader( 2666 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2667 ) 2668 2669 @staticmethod 2670 def create_jwt_authenticator( 2671 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2672 ) -> JwtAuthenticator: 2673 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2674 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2675 return JwtAuthenticator( 2676 config=config, 2677 parameters=model.parameters or {}, 2678 algorithm=JwtAlgorithm(model.algorithm.value), 2679 secret_key=model.secret_key, 2680 base64_encode_secret_key=model.base64_encode_secret_key, 2681 token_duration=model.token_duration, 
2682 header_prefix=model.header_prefix, 2683 kid=jwt_headers.kid, 2684 typ=jwt_headers.typ, 2685 cty=jwt_headers.cty, 2686 iss=jwt_payload.iss, 2687 sub=jwt_payload.sub, 2688 aud=jwt_payload.aud, 2689 additional_jwt_headers=model.additional_jwt_headers, 2690 additional_jwt_payload=model.additional_jwt_payload, 2691 ) 2692 2693 def create_list_partition_router( 2694 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2695 ) -> ListPartitionRouter: 2696 request_option = ( 2697 self._create_component_from_model(model.request_option, config) 2698 if model.request_option 2699 else None 2700 ) 2701 return ListPartitionRouter( 2702 cursor_field=model.cursor_field, 2703 request_option=request_option, 2704 values=model.values, 2705 config=config, 2706 parameters=model.parameters or {}, 2707 ) 2708 2709 @staticmethod 2710 def create_min_max_datetime( 2711 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2712 ) -> MinMaxDatetime: 2713 return MinMaxDatetime( 2714 datetime=model.datetime, 2715 datetime_format=model.datetime_format or "", 2716 max_datetime=model.max_datetime or "", 2717 min_datetime=model.min_datetime or "", 2718 parameters=model.parameters or {}, 2719 ) 2720 2721 @staticmethod 2722 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2723 return NoAuth(parameters=model.parameters or {}) 2724 2725 @staticmethod 2726 def create_no_pagination( 2727 model: NoPaginationModel, config: Config, **kwargs: Any 2728 ) -> NoPagination: 2729 return NoPagination(parameters={}) 2730 2731 def create_oauth_authenticator( 2732 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2733 ) -> DeclarativeOauth2Authenticator: 2734 profile_assertion = ( 2735 self._create_component_from_model(model.profile_assertion, config=config) 2736 if model.profile_assertion 2737 else None 2738 ) 2739 2740 if model.refresh_token_updater: 2741 # ignore type error because fixing it would have a lot of dependencies, revisit later 2742 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2743 config, 2744 InterpolatedString.create( 2745 model.token_refresh_endpoint, # type: ignore 2746 parameters=model.parameters or {}, 2747 ).eval(config), 2748 access_token_name=InterpolatedString.create( 2749 model.access_token_name or "access_token", parameters=model.parameters or {} 2750 ).eval(config), 2751 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2752 expires_in_name=InterpolatedString.create( 2753 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2754 ).eval(config), 2755 client_id_name=InterpolatedString.create( 2756 model.client_id_name or "client_id", parameters=model.parameters or {} 2757 ).eval(config), 2758 client_id=InterpolatedString.create( 2759 model.client_id, parameters=model.parameters or {} 2760 ).eval(config) 2761 if model.client_id 2762 else model.client_id, 2763 client_secret_name=InterpolatedString.create( 2764 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2765 ).eval(config), 2766 client_secret=InterpolatedString.create( 2767 model.client_secret, parameters=model.parameters or {} 2768 ).eval(config) 2769 if model.client_secret 2770 else model.client_secret, 2771 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2772 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2773 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2774 
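# The config paths above tell the single-use refresh token authenticator where in the connector config to read and persist the rotated access token, refresh token, and expiry date.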
grant_type_name=InterpolatedString.create( 2775 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2776 ).eval(config), 2777 grant_type=InterpolatedString.create( 2778 model.grant_type or "refresh_token", parameters=model.parameters or {} 2779 ).eval(config), 2780 refresh_request_body=InterpolatedMapping( 2781 model.refresh_request_body or {}, parameters=model.parameters or {} 2782 ).eval(config), 2783 refresh_request_headers=InterpolatedMapping( 2784 model.refresh_request_headers or {}, parameters=model.parameters or {} 2785 ).eval(config), 2786 scopes=model.scopes, 2787 token_expiry_date_format=model.token_expiry_date_format, 2788 message_repository=self._message_repository, 2789 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2790 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2791 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2792 ) 2793 # ignore type error because fixing it would have a lot of dependencies, revisit later 2794 return DeclarativeOauth2Authenticator( # type: ignore 2795 access_token_name=model.access_token_name or "access_token", 2796 access_token_value=model.access_token_value, 2797 client_id_name=model.client_id_name or "client_id", 2798 client_id=model.client_id, 2799 client_secret_name=model.client_secret_name or "client_secret", 2800 client_secret=model.client_secret, 2801 expires_in_name=model.expires_in_name or "expires_in", 2802 grant_type_name=model.grant_type_name or "grant_type", 2803 grant_type=model.grant_type or "refresh_token", 2804 refresh_request_body=model.refresh_request_body, 2805 refresh_request_headers=model.refresh_request_headers, 2806 refresh_token_name=model.refresh_token_name or "refresh_token", 2807 refresh_token=model.refresh_token, 2808 scopes=model.scopes, 2809 token_expiry_date=model.token_expiry_date, 2810 token_expiry_date_format=model.token_expiry_date_format, 2811 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2812 token_refresh_endpoint=model.token_refresh_endpoint, 2813 config=config, 2814 parameters=model.parameters or {}, 2815 message_repository=self._message_repository, 2816 profile_assertion=profile_assertion, 2817 use_profile_assertion=model.use_profile_assertion, 2818 ) 2819 2820 def create_offset_increment( 2821 self, 2822 model: OffsetIncrementModel, 2823 config: Config, 2824 decoder: Decoder, 2825 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2826 **kwargs: Any, 2827 ) -> OffsetIncrement: 2828 if isinstance(decoder, PaginationDecoderDecorator): 2829 inner_decoder = decoder.decoder 2830 else: 2831 inner_decoder = decoder 2832 decoder = PaginationDecoderDecorator(decoder=decoder) 2833 2834 if self._is_supported_decoder_for_pagination(inner_decoder): 2835 decoder_to_use = decoder 2836 else: 2837 raise ValueError( 2838 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2839 ) 2840 2841 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2842 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2843 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2844 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 
2845 # When we have more time to investigate we can look into reusing the same component. 2846 extractor = ( 2847 self._create_component_from_model( 2848 model=extractor_model, config=config, decoder=decoder_to_use 2849 ) 2850 if extractor_model 2851 else None 2852 ) 2853 2854 return OffsetIncrement( 2855 page_size=model.page_size, 2856 config=config, 2857 decoder=decoder_to_use, 2858 extractor=extractor, 2859 inject_on_first_request=model.inject_on_first_request or False, 2860 parameters=model.parameters or {}, 2861 ) 2862 2863 @staticmethod 2864 def create_page_increment( 2865 model: PageIncrementModel, config: Config, **kwargs: Any 2866 ) -> PageIncrement: 2867 return PageIncrement( 2868 page_size=model.page_size, 2869 config=config, 2870 start_from_page=model.start_from_page or 0, 2871 inject_on_first_request=model.inject_on_first_request or False, 2872 parameters=model.parameters or {}, 2873 ) 2874 2875 def create_parent_stream_config( 2876 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2877 ) -> ParentStreamConfig: 2878 declarative_stream = self._create_component_from_model( 2879 model.stream, config=config, **kwargs 2880 ) 2881 request_option = ( 2882 self._create_component_from_model(model.request_option, config=config) 2883 if model.request_option 2884 else None 2885 ) 2886 2887 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2888 raise ValueError( 2889 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2890 ) 2891 2892 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2893 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2894 ) 2895 2896 return ParentStreamConfig( 2897 parent_key=model.parent_key, 2898 request_option=request_option, 2899 stream=declarative_stream, 2900 partition_field=model.partition_field, 2901 config=config, 2902 incremental_dependency=model.incremental_dependency or False, 2903 parameters=model.parameters or {}, 2904 extra_fields=model.extra_fields, 2905 lazy_read_pointer=model_lazy_read_pointer, 2906 ) 2907 2908 def create_properties_from_endpoint( 2909 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2910 ) -> PropertiesFromEndpoint: 2911 retriever = self._create_component_from_model( 2912 model=model.retriever, 2913 config=config, 2914 name="dynamic_properties", 2915 primary_key=None, 2916 stream_slicer=None, 2917 transformations=[], 2918 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2919 ) 2920 return PropertiesFromEndpoint( 2921 property_field_path=model.property_field_path, 2922 retriever=retriever, 2923 config=config, 2924 parameters=model.parameters or {}, 2925 ) 2926 2927 def create_property_chunking( 2928 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2929 ) -> PropertyChunking: 2930 record_merge_strategy = ( 2931 self._create_component_from_model( 2932 model=model.record_merge_strategy, config=config, **kwargs 2933 ) 2934 if model.record_merge_strategy 2935 else None 2936 ) 2937 2938 property_limit_type: PropertyLimitType 2939 match model.property_limit_type: 2940 case PropertyLimitTypeModel.property_count: 2941 property_limit_type = PropertyLimitType.property_count 2942 case PropertyLimitTypeModel.characters: 2943 property_limit_type = PropertyLimitType.characters 2944 case _: 2945 raise ValueError(f"Invalid 
PropertyLimitType {model.property_limit_type}") 2946 2947 return PropertyChunking( 2948 property_limit_type=property_limit_type, 2949 property_limit=model.property_limit, 2950 record_merge_strategy=record_merge_strategy, 2951 config=config, 2952 parameters=model.parameters or {}, 2953 ) 2954 2955 def create_query_properties( 2956 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2957 ) -> QueryProperties: 2958 if isinstance(model.property_list, list): 2959 property_list = model.property_list 2960 else: 2961 property_list = self._create_component_from_model( 2962 model=model.property_list, config=config, **kwargs 2963 ) 2964 2965 property_chunking = ( 2966 self._create_component_from_model( 2967 model=model.property_chunking, config=config, **kwargs 2968 ) 2969 if model.property_chunking 2970 else None 2971 ) 2972 2973 return QueryProperties( 2974 property_list=property_list, 2975 always_include_properties=model.always_include_properties, 2976 property_chunking=property_chunking, 2977 config=config, 2978 parameters=model.parameters or {}, 2979 ) 2980 2981 @staticmethod 2982 def create_record_filter( 2983 model: RecordFilterModel, config: Config, **kwargs: Any 2984 ) -> RecordFilter: 2985 return RecordFilter( 2986 condition=model.condition or "", config=config, parameters=model.parameters or {} 2987 ) 2988 2989 @staticmethod 2990 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 2991 return RequestPath(parameters={}) 2992 2993 @staticmethod 2994 def create_request_option( 2995 model: RequestOptionModel, config: Config, **kwargs: Any 2996 ) -> RequestOption: 2997 inject_into = RequestOptionType(model.inject_into.value) 2998 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 2999 [ 3000 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3001 for segment in model.field_path 3002 ] 3003 if model.field_path 3004 else None 3005 ) 3006 field_name = ( 3007 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3008 if model.field_name 3009 else None 3010 ) 3011 return RequestOption( 3012 field_name=field_name, 3013 field_path=field_path, 3014 inject_into=inject_into, 3015 parameters=kwargs.get("parameters", {}), 3016 ) 3017 3018 def create_record_selector( 3019 self, 3020 model: RecordSelectorModel, 3021 config: Config, 3022 *, 3023 name: str, 3024 transformations: List[RecordTransformation] | None = None, 3025 decoder: Decoder | None = None, 3026 client_side_incremental_sync: Dict[str, Any] | None = None, 3027 file_uploader: Optional[DefaultFileUploader] = None, 3028 **kwargs: Any, 3029 ) -> RecordSelector: 3030 extractor = self._create_component_from_model( 3031 model=model.extractor, decoder=decoder, config=config 3032 ) 3033 record_filter = ( 3034 self._create_component_from_model(model.record_filter, config=config) 3035 if model.record_filter 3036 else None 3037 ) 3038 3039 transform_before_filtering = ( 3040 False if model.transform_before_filtering is None else model.transform_before_filtering 3041 ) 3042 if client_side_incremental_sync: 3043 record_filter = ClientSideIncrementalRecordFilterDecorator( 3044 config=config, 3045 parameters=model.parameters, 3046 condition=model.record_filter.condition 3047 if (model.record_filter and hasattr(model.record_filter, "condition")) 3048 else None, 3049 **client_side_incremental_sync, 3050 ) 3051 transform_before_filtering = ( 3052 True 3053 if model.transform_before_filtering is None 3054 else model.transform_before_filtering 3055
) 3056 3057 if model.schema_normalization is None: 3058 # default to no schema normalization if not set 3059 model.schema_normalization = SchemaNormalizationModel.None_ 3060 3061 schema_normalization = ( 3062 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3063 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3064 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3065 ) 3066 3067 return RecordSelector( 3068 extractor=extractor, 3069 name=name, 3070 config=config, 3071 record_filter=record_filter, 3072 transformations=transformations or [], 3073 file_uploader=file_uploader, 3074 schema_normalization=schema_normalization, 3075 parameters=model.parameters or {}, 3076 transform_before_filtering=transform_before_filtering, 3077 ) 3078 3079 @staticmethod 3080 def create_remove_fields( 3081 model: RemoveFieldsModel, config: Config, **kwargs: Any 3082 ) -> RemoveFields: 3083 return RemoveFields( 3084 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3085 ) 3086 3087 def create_selective_authenticator( 3088 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3089 ) -> DeclarativeAuthenticator: 3090 authenticators = { 3091 name: self._create_component_from_model(model=auth, config=config) 3092 for name, auth in model.authenticators.items() 3093 } 3094 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3095 return SelectiveAuthenticator( # type: ignore[abstract] 3096 config=config, 3097 authenticators=authenticators, 3098 authenticator_selection_path=model.authenticator_selection_path, 3099 **kwargs, 3100 ) 3101 3102 @staticmethod 3103 def create_legacy_session_token_authenticator( 3104 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3105 ) -> LegacySessionTokenAuthenticator: 3106 return LegacySessionTokenAuthenticator( 3107 api_url=url_base, 3108 header=model.header, 3109 login_url=model.login_url, 3110 password=model.password or "", 3111 session_token=model.session_token or "", 3112 session_token_response_key=model.session_token_response_key or "", 3113 username=model.username or "", 3114 validate_session_url=model.validate_session_url, 3115 config=config, 3116 parameters=model.parameters or {}, 3117 ) 3118 3119 def create_simple_retriever( 3120 self, 3121 model: SimpleRetrieverModel, 3122 config: Config, 3123 *, 3124 name: str, 3125 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3126 stream_slicer: Optional[StreamSlicer], 3127 request_options_provider: Optional[RequestOptionsProvider] = None, 3128 stop_condition_on_cursor: bool = False, 3129 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3130 transformations: List[RecordTransformation], 3131 file_uploader: Optional[DefaultFileUploader] = None, 3132 incremental_sync: Optional[ 3133 Union[ 3134 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3135 ] 3136 ] = None, 3137 use_cache: Optional[bool] = None, 3138 log_formatter: Optional[Callable[[Response], Any]] = None, 3139 **kwargs: Any, 3140 ) -> SimpleRetriever: 3141 def _get_url() -> str: 3142 """ 3143 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3144 This is needed because the URL is not set until the requester is created. 
3145 """ 3146 3147 _url = ( 3148 model.requester.url 3149 if hasattr(model.requester, "url") and model.requester.url is not None 3150 else requester.get_url() 3151 ) 3152 _url_base = ( 3153 model.requester.url_base 3154 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3155 else requester.get_url_base() 3156 ) 3157 3158 return _url or _url_base 3159 3160 decoder = ( 3161 self._create_component_from_model(model=model.decoder, config=config) 3162 if model.decoder 3163 else JsonDecoder(parameters={}) 3164 ) 3165 record_selector = self._create_component_from_model( 3166 model=model.record_selector, 3167 name=name, 3168 config=config, 3169 decoder=decoder, 3170 transformations=transformations, 3171 client_side_incremental_sync=client_side_incremental_sync, 3172 file_uploader=file_uploader, 3173 ) 3174 3175 query_properties: Optional[QueryProperties] = None 3176 query_properties_key: Optional[str] = None 3177 if self._query_properties_in_request_parameters(model.requester): 3178 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3179 # places instead of default to request_parameters which isn't clearly documented 3180 if ( 3181 hasattr(model.requester, "fetch_properties_from_endpoint") 3182 and model.requester.fetch_properties_from_endpoint 3183 ): 3184 raise ValueError( 3185 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3186 ) 3187 3188 query_properties_definitions = [] 3189 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3190 if isinstance(request_parameter, QueryPropertiesModel): 3191 query_properties_key = key 3192 query_properties_definitions.append(request_parameter) 3193 3194 if len(query_properties_definitions) > 1: 3195 raise ValueError( 3196 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3197 ) 3198 3199 if len(query_properties_definitions) == 1: 3200 query_properties = self._create_component_from_model( 3201 model=query_properties_definitions[0], config=config 3202 ) 3203 elif ( 3204 hasattr(model.requester, "fetch_properties_from_endpoint") 3205 and model.requester.fetch_properties_from_endpoint 3206 ): 3207 query_properties_definition = QueryPropertiesModel( 3208 type="QueryProperties", 3209 property_list=model.requester.fetch_properties_from_endpoint, 3210 always_include_properties=None, 3211 property_chunking=None, 3212 ) # type: ignore # $parameters has a default value 3213 3214 query_properties = self.create_query_properties( 3215 model=query_properties_definition, 3216 config=config, 3217 ) 3218 3219 requester = self._create_component_from_model( 3220 model=model.requester, 3221 decoder=decoder, 3222 name=name, 3223 query_properties_key=query_properties_key, 3224 use_cache=use_cache, 3225 config=config, 3226 ) 3227 3228 # Define cursor only if per partition or common incremental support is needed 3229 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3230 3231 if ( 3232 not isinstance(stream_slicer, DatetimeBasedCursor) 3233 or type(stream_slicer) is not DatetimeBasedCursor 3234 ): 3235 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3236 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3237 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3238 # request_options_provider 3239 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3240 elif not request_options_provider: 3241 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3242 3243 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3244 3245 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3246 paginator = ( 3247 self._create_component_from_model( 3248 model=model.paginator, 3249 config=config, 3250 url_base=_get_url(), 3251 extractor_model=model.record_selector.extractor, 3252 decoder=decoder, 3253 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3254 ) 3255 if model.paginator 3256 else NoPagination(parameters={}) 3257 ) 3258 3259 ignore_stream_slicer_parameters_on_paginated_requests = ( 3260 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3261 ) 3262 3263 if ( 3264 model.partition_router 3265 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3266 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3267 and any( 3268 parent_stream_config.lazy_read_pointer 3269 for parent_stream_config in model.partition_router.parent_stream_configs 3270 ) 3271 ): 3272 if incremental_sync: 3273 if incremental_sync.type != "DatetimeBasedCursor": 3274 raise ValueError( 3275 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3276 ) 3277 3278 elif incremental_sync.step or incremental_sync.cursor_granularity: 3279 raise ValueError( 3280 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3281 ) 3282 3283 if model.decoder and model.decoder.type != "JsonDecoder": 3284 raise ValueError( 3285 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3286 ) 3287 3288 return LazySimpleRetriever( 3289 name=name, 3290 paginator=paginator, 3291 primary_key=primary_key, 3292 requester=requester, 3293 record_selector=record_selector, 3294 stream_slicer=stream_slicer, 3295 request_option_provider=request_options_provider, 3296 cursor=cursor, 3297 config=config, 3298 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3299 parameters=model.parameters or {}, 3300 ) 3301 3302 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3303 return SimpleRetrieverTestReadDecorator( 3304 name=name, 3305 paginator=paginator, 3306 primary_key=primary_key, 3307 requester=requester, 3308 record_selector=record_selector, 3309 stream_slicer=stream_slicer, 3310 request_option_provider=request_options_provider, 3311 cursor=cursor, 3312 config=config, 3313 maximum_number_of_slices=self._limit_slices_fetched or 5, 3314 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3315 log_formatter=log_formatter, 3316 parameters=model.parameters or {}, 3317 ) 3318 return SimpleRetriever( 3319 name=name, 3320 paginator=paginator, 3321 primary_key=primary_key, 3322 requester=requester, 3323 record_selector=record_selector, 3324 stream_slicer=stream_slicer, 3325 request_option_provider=request_options_provider, 3326 cursor=cursor, 3327 config=config, 3328 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3329 additional_query_properties=query_properties, 3330 parameters=model.parameters or {}, 3331 ) 3332 3333 @staticmethod 3334 def _query_properties_in_request_parameters( 3335 requester: Union[HttpRequesterModel, CustomRequesterModel], 3336 ) -> bool: 3337 if not hasattr(requester, "request_parameters"): 3338 return False 3339 request_parameters = requester.request_parameters 3340 if request_parameters and isinstance(request_parameters, Mapping): 3341 for request_parameter in request_parameters.values(): 3342 if isinstance(request_parameter, QueryPropertiesModel): 3343 return True 3344 return False 3345 3346 @staticmethod 3347 def _remove_query_properties( 3348 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3349 ) -> Mapping[str, str]: 3350 return { 3351 parameter_field: request_parameter 3352 for parameter_field, request_parameter in request_parameters.items() 3353 if not isinstance(request_parameter, QueryPropertiesModel) 3354 } 3355 3356 def create_state_delegating_stream( 3357 self, 3358 model: StateDelegatingStreamModel, 3359 config: Config, 3360 has_parent_state: Optional[bool] = None, 3361 **kwargs: Any, 3362 ) -> DeclarativeStream: 3363 if ( 3364 model.full_refresh_stream.name != model.name 3365 or model.name != model.incremental_stream.name 3366 ): 3367 raise ValueError( 3368 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3369 ) 3370 3371 stream_model = ( 3372 model.incremental_stream 3373 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3374 else model.full_refresh_stream 3375 ) 3376 3377 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3378 3379 def _create_async_job_status_mapping( 3380 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3381 ) -> Mapping[str, AsyncJobStatus]: 3382 api_status_to_cdk_status = {} 3383 for cdk_status, api_statuses in model.dict().items(): 3384 if cdk_status == "type": 3385 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3386 continue 3387 3388 for status in api_statuses: 3389 if status in api_status_to_cdk_status: 3390 raise ValueError( 3391 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3392 ) 3393 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3394 return api_status_to_cdk_status 3395 3396 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3397 match status: 3398 case "running": 3399 return AsyncJobStatus.RUNNING 3400 case "completed": 3401 return AsyncJobStatus.COMPLETED 3402 case "failed": 3403 return AsyncJobStatus.FAILED 3404 case "timeout": 3405 return AsyncJobStatus.TIMED_OUT 3406 case _: 3407 raise ValueError(f"Unsupported CDK status {status}") 3408 3409 def create_async_retriever( 3410 self, 3411 model: AsyncRetrieverModel, 3412 config: Config, 3413 *, 3414 name: str, 3415 primary_key: Optional[ 3416 Union[str, List[str], List[List[str]]] 3417 ], # this seems to be needed to match create_simple_retriever 3418 stream_slicer: Optional[StreamSlicer], 3419 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3420 transformations: List[RecordTransformation], 3421 **kwargs: Any, 3422 ) -> AsyncRetriever: 3423 def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever: 3424 record_selector = RecordSelector( 3425 extractor=download_extractor, 3426 name=name, 3427 record_filter=None, 3428 transformations=transformations, 3429 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3430 config=config, 3431 parameters={}, 3432 ) 3433 paginator = ( 3434 self._create_component_from_model( 3435 model=model.download_paginator, 3436 decoder=decoder, 3437 config=config, 3438 url_base="", 3439 ) 3440 if model.download_paginator 3441 else NoPagination(parameters={}) 3442 ) 3443 maximum_number_of_slices = self._limit_slices_fetched or 5 3444 3445 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3446 return SimpleRetrieverTestReadDecorator( 3447 requester=download_requester, 3448 record_selector=record_selector, 3449 primary_key=None, 3450 name=job_download_components_name, 3451 paginator=paginator, 3452 config=config, 3453 parameters={}, 3454 maximum_number_of_slices=maximum_number_of_slices, 3455 ) 3456 3457 return SimpleRetriever( 3458 requester=download_requester, 3459 record_selector=record_selector, 3460 primary_key=None, 3461 name=job_download_components_name, 3462 paginator=paginator, 3463 config=config, 3464 parameters={}, 3465 ) 3466 3467 def _get_job_timeout() -> datetime.timedelta: 3468 user_defined_timeout: Optional[int] = ( 3469 int( 3470 InterpolatedString.create( 3471 str(model.polling_job_timeout), 3472 parameters={}, 3473 ).eval(config) 3474 ) 3475 if 
model.polling_job_timeout 3476 else None 3477 ) 3478 3479 # check for user defined timeout during the test read or 15 minutes 3480 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3481 # default value for non-connector builder is 60 minutes. 3482 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3483 3484 return ( 3485 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3486 ) 3487 3488 decoder = ( 3489 self._create_component_from_model(model=model.decoder, config=config) 3490 if model.decoder 3491 else JsonDecoder(parameters={}) 3492 ) 3493 record_selector = self._create_component_from_model( 3494 model=model.record_selector, 3495 config=config, 3496 decoder=decoder, 3497 name=name, 3498 transformations=transformations, 3499 client_side_incremental_sync=client_side_incremental_sync, 3500 ) 3501 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3502 creation_requester = self._create_component_from_model( 3503 model=model.creation_requester, 3504 decoder=decoder, 3505 config=config, 3506 name=f"job creation - {name}", 3507 ) 3508 polling_requester = self._create_component_from_model( 3509 model=model.polling_requester, 3510 decoder=decoder, 3511 config=config, 3512 name=f"job polling - {name}", 3513 ) 3514 job_download_components_name = f"job download - {name}" 3515 download_decoder = ( 3516 self._create_component_from_model(model=model.download_decoder, config=config) 3517 if model.download_decoder 3518 else JsonDecoder(parameters={}) 3519 ) 3520 download_extractor = ( 3521 self._create_component_from_model( 3522 model=model.download_extractor, 3523 config=config, 3524 decoder=download_decoder, 3525 parameters=model.parameters, 3526 ) 3527 if model.download_extractor 3528 else DpathExtractor( 3529 [], 3530 config=config, 3531 decoder=download_decoder, 3532 parameters=model.parameters or {}, 3533 ) 3534 ) 3535 download_requester = self._create_component_from_model( 3536 model=model.download_requester, 3537 decoder=download_decoder, 3538 config=config, 3539 name=job_download_components_name, 3540 ) 3541 download_retriever = _get_download_retriever() 3542 abort_requester = ( 3543 self._create_component_from_model( 3544 model=model.abort_requester, 3545 decoder=decoder, 3546 config=config, 3547 name=f"job abort - {name}", 3548 ) 3549 if model.abort_requester 3550 else None 3551 ) 3552 delete_requester = ( 3553 self._create_component_from_model( 3554 model=model.delete_requester, 3555 decoder=decoder, 3556 config=config, 3557 name=f"job delete - {name}", 3558 ) 3559 if model.delete_requester 3560 else None 3561 ) 3562 download_target_requester = ( 3563 self._create_component_from_model( 3564 model=model.download_target_requester, 3565 decoder=decoder, 3566 config=config, 3567 name=f"job extract_url - {name}", 3568 ) 3569 if model.download_target_requester 3570 else None 3571 ) 3572 status_extractor = self._create_component_from_model( 3573 model=model.status_extractor, decoder=decoder, config=config, name=name 3574 ) 3575 download_target_extractor = self._create_component_from_model( 3576 model=model.download_target_extractor, 3577 decoder=decoder, 3578 config=config, 3579 name=name, 3580 ) 3581 3582 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3583 creation_requester=creation_requester, 3584 polling_requester=polling_requester, 3585 download_retriever=download_retriever, 3586 download_target_requester=download_target_requester, 3587 abort_requester=abort_requester, 3588 
delete_requester=delete_requester, 3589 status_extractor=status_extractor, 3590 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3591 download_target_extractor=download_target_extractor, 3592 job_timeout=_get_job_timeout(), 3593 ) 3594 3595 async_job_partition_router = AsyncJobPartitionRouter( 3596 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3597 job_repository, 3598 stream_slices, 3599 self._job_tracker, 3600 self._message_repository, 3601 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3602 has_bulk_parent=False, 3603 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3604 # `None` == default retry is set to 3 attempts, under the hood. 3605 job_max_retry=1 if self._emit_connector_builder_messages else None, 3606 ), 3607 stream_slicer=stream_slicer, 3608 config=config, 3609 parameters=model.parameters or {}, 3610 ) 3611 3612 return AsyncRetriever( 3613 record_selector=record_selector, 3614 stream_slicer=async_job_partition_router, 3615 config=config, 3616 parameters=model.parameters or {}, 3617 ) 3618 3619 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3620 config_migrations = [ 3621 self._create_component_from_model(migration, config) 3622 for migration in ( 3623 model.config_normalization_rules.config_migrations 3624 if ( 3625 model.config_normalization_rules 3626 and model.config_normalization_rules.config_migrations 3627 ) 3628 else [] 3629 ) 3630 ] 3631 config_transformations = [ 3632 self._create_component_from_model(transformation, config) 3633 for transformation in ( 3634 model.config_normalization_rules.transformations 3635 if ( 3636 model.config_normalization_rules 3637 and model.config_normalization_rules.transformations 3638 ) 3639 else [] 3640 ) 3641 ] 3642 config_validations = [ 3643 self._create_component_from_model(validation, config) 3644 for validation in ( 3645 model.config_normalization_rules.validations 3646 if ( 3647 model.config_normalization_rules 3648 and model.config_normalization_rules.validations 3649 ) 3650 else [] 3651 ) 3652 ] 3653 3654 return Spec( 3655 connection_specification=model.connection_specification, 3656 documentation_url=model.documentation_url, 3657 advanced_auth=model.advanced_auth, 3658 parameters={}, 3659 config_migrations=config_migrations, 3660 config_transformations=config_transformations, 3661 config_validations=config_validations, 3662 ) 3663 3664 def create_substream_partition_router( 3665 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3666 ) -> SubstreamPartitionRouter: 3667 parent_stream_configs = [] 3668 if model.parent_stream_configs: 3669 parent_stream_configs.extend( 3670 [ 3671 self._create_message_repository_substream_wrapper( 3672 model=parent_stream_config, config=config, **kwargs 3673 ) 3674 for parent_stream_config in model.parent_stream_configs 3675 ] 3676 ) 3677 3678 return SubstreamPartitionRouter( 3679 parent_stream_configs=parent_stream_configs, 3680 parameters=model.parameters or {}, 3681 config=config, 3682 ) 3683 3684 def _create_message_repository_substream_wrapper( 3685 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3686 ) -> Any: 3687 substream_factory = ModelToComponentFactory( 3688 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3689 limit_slices_fetched=self._limit_slices_fetched, 3690 emit_connector_builder_messages=self._emit_connector_builder_messages, 3691 
disable_retries=self._disable_retries, 3692 disable_cache=self._disable_cache, 3693 message_repository=LogAppenderMessageRepositoryDecorator( 3694 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3695 self._message_repository, 3696 self._evaluate_log_level(self._emit_connector_builder_messages), 3697 ), 3698 ) 3699 3700 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3701 has_parent_state = bool( 3702 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3703 if model.incremental_dependency 3704 else False 3705 ) 3706 return substream_factory._create_component_from_model( 3707 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3708 ) 3709 3710 @staticmethod 3711 def create_wait_time_from_header( 3712 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3713 ) -> WaitTimeFromHeaderBackoffStrategy: 3714 return WaitTimeFromHeaderBackoffStrategy( 3715 header=model.header, 3716 parameters=model.parameters or {}, 3717 config=config, 3718 regex=model.regex, 3719 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3720 if model.max_waiting_time_in_seconds is not None 3721 else None, 3722 ) 3723 3724 @staticmethod 3725 def create_wait_until_time_from_header( 3726 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3727 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3728 return WaitUntilTimeFromHeaderBackoffStrategy( 3729 header=model.header, 3730 parameters=model.parameters or {}, 3731 config=config, 3732 min_wait=model.min_wait, 3733 regex=model.regex, 3734 ) 3735 3736 def get_message_repository(self) -> MessageRepository: 3737 return self._message_repository 3738 3739 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3740 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3741 3742 @staticmethod 3743 def create_components_mapping_definition( 3744 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3745 ) -> ComponentMappingDefinition: 3746 interpolated_value = InterpolatedString.create( 3747 model.value, parameters=model.parameters or {} 3748 ) 3749 field_path = [ 3750 InterpolatedString.create(path, parameters=model.parameters or {}) 3751 for path in model.field_path 3752 ] 3753 return ComponentMappingDefinition( 3754 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3755 value=interpolated_value, 3756 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3757 create_or_update=model.create_or_update, 3758 parameters=model.parameters or {}, 3759 ) 3760 3761 def create_http_components_resolver( 3762 self, model: HttpComponentsResolverModel, config: Config 3763 ) -> Any: 3764 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3765 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3766 3767 retriever = self._create_component_from_model( 3768 model=model.retriever, 3769 config=config, 3770 name="", 3771 primary_key=None, 3772 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3773 transformations=[], 3774 ) 3775 3776 components_mapping = [ 3777 self._create_component_from_model( 3778 model=components_mapping_definition_model, 3779 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3780 components_mapping_definition_model.value_type 3781 ), 3782 config=config, 3783 ) 3784 for 
components_mapping_definition_model in model.components_mapping 3785 ] 3786 3787 return HttpComponentsResolver( 3788 retriever=retriever, 3789 config=config, 3790 components_mapping=components_mapping, 3791 parameters=model.parameters or {}, 3792 ) 3793 3794 @staticmethod 3795 def create_stream_config( 3796 model: StreamConfigModel, config: Config, **kwargs: Any 3797 ) -> StreamConfig: 3798 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3799 [x for x in model.configs_pointer] if model.configs_pointer else [] 3800 ) 3801 3802 return StreamConfig( 3803 configs_pointer=model_configs_pointer, 3804 default_values=model.default_values, 3805 parameters=model.parameters or {}, 3806 ) 3807 3808 def create_config_components_resolver( 3809 self, model: ConfigComponentsResolverModel, config: Config 3810 ) -> Any: 3811 model_stream_configs = ( 3812 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3813 ) 3814 3815 stream_configs = [ 3816 self._create_component_from_model( 3817 stream_config, config=config, parameters=model.parameters or {} 3818 ) 3819 for stream_config in model_stream_configs 3820 ] 3821 3822 components_mapping = [ 3823 self._create_component_from_model( 3824 model=components_mapping_definition_model, 3825 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3826 components_mapping_definition_model.value_type 3827 ), 3828 config=config, 3829 ) 3830 for components_mapping_definition_model in model.components_mapping 3831 ] 3832 3833 return ConfigComponentsResolver( 3834 stream_configs=stream_configs, 3835 config=config, 3836 components_mapping=components_mapping, 3837 parameters=model.parameters or {}, 3838 ) 3839 3840 _UNSUPPORTED_DECODER_ERROR = ( 3841 "Specified decoder of {decoder_type} is not supported for pagination." 3842 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3843 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3844 ) 3845 3846 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3847 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3848 return True 3849 elif isinstance(decoder, CompositeRawDecoder): 3850 return self._is_supported_parser_for_pagination(decoder.parser) 3851 else: 3852 return False 3853 3854 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3855 if isinstance(parser, JsonParser): 3856 return True 3857 elif isinstance(parser, GzipParser): 3858 return isinstance(parser.inner_parser, JsonParser) 3859 else: 3860 return False 3861 3862 def create_http_api_budget( 3863 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3864 ) -> HttpAPIBudget: 3865 policies = [ 3866 self._create_component_from_model(model=policy, config=config) 3867 for policy in model.policies 3868 ] 3869 3870 return HttpAPIBudget( 3871 policies=policies, 3872 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3873 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3874 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3875 ) 3876 3877 def create_fixed_window_call_rate_policy( 3878 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3879 ) -> FixedWindowCallRatePolicy: 3880 matchers = [ 3881 self._create_component_from_model(model=matcher, config=config) 3882 for matcher in model.matchers 3883 ] 3884 3885 # Set the initial reset timestamp to 10 days from now. 3886 # This value will be updated by the first request. 3887 return FixedWindowCallRatePolicy( 3888 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3889 period=parse_duration(model.period), 3890 call_limit=model.call_limit, 3891 matchers=matchers, 3892 ) 3893 3894 def create_file_uploader( 3895 self, model: FileUploaderModel, config: Config, **kwargs: Any 3896 ) -> FileUploader: 3897 name = "File Uploader" 3898 requester = self._create_component_from_model( 3899 model=model.requester, 3900 config=config, 3901 name=name, 3902 **kwargs, 3903 ) 3904 download_target_extractor = self._create_component_from_model( 3905 model=model.download_target_extractor, 3906 config=config, 3907 name=name, 3908 **kwargs, 3909 ) 3910 emit_connector_builder_messages = self._emit_connector_builder_messages 3911 file_uploader = DefaultFileUploader( 3912 requester=requester, 3913 download_target_extractor=download_target_extractor, 3914 config=config, 3915 file_writer=NoopFileWriter() 3916 if emit_connector_builder_messages 3917 else LocalFileSystemFileWriter(), 3918 parameters=model.parameters or {}, 3919 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3920 ) 3921 3922 return ( 3923 ConnectorBuilderFileUploader(file_uploader) 3924 if emit_connector_builder_messages 3925 else file_uploader 3926 ) 3927 3928 def create_moving_window_call_rate_policy( 3929 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3930 ) -> MovingWindowCallRatePolicy: 3931 rates = [ 3932 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3933 ] 3934 matchers = [ 3935 self._create_component_from_model(model=matcher, config=config) 3936 for matcher in model.matchers 3937 ] 3938 return MovingWindowCallRatePolicy( 3939 rates=rates, 3940 matchers=matchers, 3941 ) 3942 3943 def create_unlimited_call_rate_policy( 3944 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 3945 ) -> UnlimitedCallRatePolicy: 3946 matchers = [ 3947 
self._create_component_from_model(model=matcher, config=config) 3948 for matcher in model.matchers 3949 ] 3950 3951 return UnlimitedCallRatePolicy( 3952 matchers=matchers, 3953 ) 3954 3955 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 3956 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 3957 return Rate( 3958 limit=int(interpolated_limit.eval(config=config)), 3959 interval=parse_duration(model.interval), 3960 ) 3961 3962 def create_http_request_matcher( 3963 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 3964 ) -> HttpRequestRegexMatcher: 3965 return HttpRequestRegexMatcher( 3966 method=model.method, 3967 url_base=model.url_base, 3968 url_path_pattern=model.url_path_pattern, 3969 params=model.params, 3970 headers=model.headers, 3971 ) 3972 3973 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 3974 self._api_budget = self.create_component( 3975 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 3976 ) 3977 3978 def create_grouping_partition_router( 3979 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 3980 ) -> GroupingPartitionRouter: 3981 underlying_router = self._create_component_from_model( 3982 model=model.underlying_partition_router, config=config 3983 ) 3984 if model.group_size < 1: 3985 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 3986 3987 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 3988 # because they are specific to individual partitions and cannot be aggregated or handled 3989 # when grouping, potentially leading to incorrect API calls. Any request customization 3990 # should be managed at the stream level through the requester's configuration. 3991 if isinstance(underlying_router, SubstreamPartitionRouter): 3992 if any( 3993 parent_config.request_option 3994 for parent_config in underlying_router.parent_stream_configs 3995 ): 3996 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3997 3998 if isinstance(underlying_router, ListPartitionRouter): 3999 if underlying_router.request_option: 4000 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4001 4002 return GroupingPartitionRouter( 4003 group_size=model.group_size, 4004 underlying_partition_router=underlying_router, 4005 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4006 config=config, 4007 )
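Illustrative usage (a minimal sketch, not part of the module source): ModelToComponentFactory, whose definition follows, maps each pydantic manifest model to a dedicated create_* method, so a typical caller instantiates the factory and hands create_component() a model type together with the raw manifest mapping. The MinMaxDatetime snippet, the start_date config key, and the import alias below are illustrative assumptions; create_component() requires the mapping's "type" value to equal model_type.__name__ and raises a ValueError otherwise.

# Hypothetical example, assuming `MinMaxDatetime` is the generated pydantic model
# whose __name__ matches the manifest "type" value (this module aliases it as
# MinMaxDatetimeModel in its own imports).
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    MinMaxDatetime as MinMaxDatetimeModel,
)

factory = ModelToComponentFactory()
component_definition = {
    "type": "MinMaxDatetime",                  # must match model_type.__name__
    "datetime": "{{ config['start_date'] }}",  # interpolated against the config at runtime
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
}
# create_component() parses the mapping into the pydantic model, then dispatches
# to create_min_max_datetime() via PYDANTIC_MODEL_TO_CONSTRUCTOR.
min_max_datetime = factory.create_component(
    model_type=MinMaxDatetimeModel,
    component_definition=component_definition,
    config={"start_date": "2024-01-01T00:00:00Z"},
)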
615class ModelToComponentFactory: 616 EPOCH_DATETIME_FORMAT = "%s" 617 618 def __init__( 619 self, 620 limit_pages_fetched_per_slice: Optional[int] = None, 621 limit_slices_fetched: Optional[int] = None, 622 emit_connector_builder_messages: bool = False, 623 disable_retries: bool = False, 624 disable_cache: bool = False, 625 disable_resumable_full_refresh: bool = False, 626 message_repository: Optional[MessageRepository] = None, 627 connector_state_manager: Optional[ConnectorStateManager] = None, 628 max_concurrent_async_job_count: Optional[int] = None, 629 ): 630 self._init_mappings() 631 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 632 self._limit_slices_fetched = limit_slices_fetched 633 self._emit_connector_builder_messages = emit_connector_builder_messages 634 self._disable_retries = disable_retries 635 self._disable_cache = disable_cache 636 self._disable_resumable_full_refresh = disable_resumable_full_refresh 637 self._message_repository = message_repository or InMemoryMessageRepository( 638 self._evaluate_log_level(emit_connector_builder_messages) 639 ) 640 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 641 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 642 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 643 # placeholder for deprecation warnings 644 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 645 646 def _init_mappings(self) -> None: 647 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 648 AddedFieldDefinitionModel: self.create_added_field_definition, 649 AddFieldsModel: self.create_add_fields, 650 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 651 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 652 BearerAuthenticatorModel: self.create_bearer_authenticator, 653 CheckStreamModel: self.create_check_stream, 654 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 655 CheckDynamicStreamModel: self.create_check_dynamic_stream, 656 CompositeErrorHandlerModel: self.create_composite_error_handler, 657 ConcurrencyLevelModel: self.create_concurrency_level, 658 ConfigMigrationModel: self.create_config_migration, 659 ConfigAddFieldsModel: self.create_config_add_fields, 660 ConfigRemapFieldModel: self.create_config_remap_field, 661 ConfigRemoveFieldsModel: self.create_config_remove_fields, 662 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 663 CsvDecoderModel: self.create_csv_decoder, 664 CursorPaginationModel: self.create_cursor_pagination, 665 CustomAuthenticatorModel: self.create_custom_component, 666 CustomBackoffStrategyModel: self.create_custom_component, 667 CustomDecoderModel: self.create_custom_component, 668 CustomErrorHandlerModel: self.create_custom_component, 669 CustomIncrementalSyncModel: self.create_custom_component, 670 CustomRecordExtractorModel: self.create_custom_component, 671 CustomRecordFilterModel: self.create_custom_component, 672 CustomRequesterModel: self.create_custom_component, 673 CustomRetrieverModel: self.create_custom_component, 674 CustomSchemaLoader: self.create_custom_component, 675 CustomSchemaNormalizationModel: self.create_custom_component, 676 CustomStateMigration: self.create_custom_component, 677 CustomPaginationStrategyModel: self.create_custom_component, 678 CustomPartitionRouterModel: self.create_custom_component, 679 CustomTransformationModel: self.create_custom_component, 680 DatetimeBasedCursorModel: 
self.create_datetime_based_cursor, 681 DeclarativeStreamModel: self.create_declarative_stream, 682 DefaultErrorHandlerModel: self.create_default_error_handler, 683 DefaultPaginatorModel: self.create_default_paginator, 684 DpathExtractorModel: self.create_dpath_extractor, 685 DpathValidatorModel: self.create_dpath_validator, 686 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 687 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 688 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 689 GroupByKeyMergeStrategyModel: self.create_group_by_key, 690 HttpRequesterModel: self.create_http_requester, 691 HttpResponseFilterModel: self.create_http_response_filter, 692 InlineSchemaLoaderModel: self.create_inline_schema_loader, 693 JsonDecoderModel: self.create_json_decoder, 694 JsonlDecoderModel: self.create_jsonl_decoder, 695 GzipDecoderModel: self.create_gzip_decoder, 696 KeysToLowerModel: self.create_keys_to_lower_transformation, 697 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 698 KeysReplaceModel: self.create_keys_replace_transformation, 699 FlattenFieldsModel: self.create_flatten_fields, 700 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 701 IterableDecoderModel: self.create_iterable_decoder, 702 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 703 XmlDecoderModel: self.create_xml_decoder, 704 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 705 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 706 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 707 TypesMapModel: self.create_types_map, 708 ComplexFieldTypeModel: self.create_complex_field_type, 709 JwtAuthenticatorModel: self.create_jwt_authenticator, 710 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 711 ListPartitionRouterModel: self.create_list_partition_router, 712 MinMaxDatetimeModel: self.create_min_max_datetime, 713 NoAuthModel: self.create_no_auth, 714 NoPaginationModel: self.create_no_pagination, 715 OAuthAuthenticatorModel: self.create_oauth_authenticator, 716 OffsetIncrementModel: self.create_offset_increment, 717 PageIncrementModel: self.create_page_increment, 718 ParentStreamConfigModel: self.create_parent_stream_config, 719 PredicateValidatorModel: self.create_predicate_validator, 720 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 721 PropertyChunkingModel: self.create_property_chunking, 722 QueryPropertiesModel: self.create_query_properties, 723 RecordFilterModel: self.create_record_filter, 724 RecordSelectorModel: self.create_record_selector, 725 RemoveFieldsModel: self.create_remove_fields, 726 RequestPathModel: self.create_request_path, 727 RequestOptionModel: self.create_request_option, 728 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 729 SelectiveAuthenticatorModel: self.create_selective_authenticator, 730 SimpleRetrieverModel: self.create_simple_retriever, 731 StateDelegatingStreamModel: self.create_state_delegating_stream, 732 SpecModel: self.create_spec, 733 SubstreamPartitionRouterModel: self.create_substream_partition_router, 734 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 735 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 736 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 737 AsyncRetrieverModel: self.create_async_retriever, 738 HttpComponentsResolverModel: self.create_http_components_resolver, 
739 ConfigComponentsResolverModel: self.create_config_components_resolver, 740 StreamConfigModel: self.create_stream_config, 741 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 742 ZipfileDecoderModel: self.create_zipfile_decoder, 743 HTTPAPIBudgetModel: self.create_http_api_budget, 744 FileUploaderModel: self.create_file_uploader, 745 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 746 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 747 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 748 RateModel: self.create_rate, 749 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 750 GroupingPartitionRouterModel: self.create_grouping_partition_router, 751 } 752 753 # Needed for the case where we need to perform a second parse on the fields of a custom component 754 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 755 756 def create_component( 757 self, 758 model_type: Type[BaseModel], 759 component_definition: ComponentDefinition, 760 config: Config, 761 **kwargs: Any, 762 ) -> Any: 763 """ 764 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 765 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 766 declarative components from that model. 767 768 :param model_type: The type of declarative component that is being initialized 769 :param component_definition: The mapping that represents a declarative component 770 :param config: The connector config that is provided by the customer 771 :return: The declarative component to be used at runtime 772 """ 773 774 component_type = component_definition.get("type") 775 if component_definition.get("type") != model_type.__name__: 776 raise ValueError( 777 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 778 ) 779 780 declarative_component_model = model_type.parse_obj(component_definition) 781 782 if not isinstance(declarative_component_model, model_type): 783 raise ValueError( 784 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 785 ) 786 787 return self._create_component_from_model( 788 model=declarative_component_model, config=config, **kwargs 789 ) 790 791 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 792 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 793 raise ValueError( 794 f"{model.__class__} with attributes {model} is not a valid component type" 795 ) 796 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 797 if not component_constructor: 798 raise ValueError(f"Could not find constructor for {model.__class__}") 799 800 # collect deprecation warnings for supported models. 801 if isinstance(model, BaseModelWithDeprecations): 802 self._collect_model_deprecations(model) 803 804 return component_constructor(model=model, config=config, **kwargs) 805 806 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 807 """ 808 Returns the deprecation warnings that were collected during the creation of components.
809 """ 810 return self._collected_deprecation_logs 811 812 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 813 """ 814 Collects deprecation logs from the given model and appends any new logs to the internal collection. 815 816 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 817 818 Args: 819 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 820 """ 821 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 822 for log in model._deprecation_logs: 823 # avoid duplicates for deprecation logs observed. 824 if log not in self._collected_deprecation_logs: 825 self._collected_deprecation_logs.append(log) 826 827 def create_config_migration( 828 self, model: ConfigMigrationModel, config: Config 829 ) -> ConfigMigration: 830 transformations: List[ConfigTransformation] = [ 831 self._create_component_from_model(transformation, config) 832 for transformation in model.transformations 833 ] 834 835 return ConfigMigration( 836 description=model.description, 837 transformations=transformations, 838 ) 839 840 def create_config_add_fields( 841 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 842 ) -> ConfigAddFields: 843 fields = [self._create_component_from_model(field, config) for field in model.fields] 844 return ConfigAddFields( 845 fields=fields, 846 condition=model.condition or "", 847 ) 848 849 @staticmethod 850 def create_config_remove_fields( 851 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 852 ) -> ConfigRemoveFields: 853 return ConfigRemoveFields( 854 field_pointers=model.field_pointers, 855 condition=model.condition or "", 856 ) 857 858 @staticmethod 859 def create_config_remap_field( 860 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 861 ) -> ConfigRemapField: 862 mapping = cast(Mapping[str, Any], model.map) 863 return ConfigRemapField( 864 map=mapping, 865 field_path=model.field_path, 866 config=config, 867 ) 868 869 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 870 strategy = self._create_component_from_model(model.validation_strategy, config) 871 872 return DpathValidator( 873 field_path=model.field_path, 874 strategy=strategy, 875 ) 876 877 def create_predicate_validator( 878 self, model: PredicateValidatorModel, config: Config 879 ) -> PredicateValidator: 880 strategy = self._create_component_from_model(model.validation_strategy, config) 881 882 return PredicateValidator( 883 value=model.value, 884 strategy=strategy, 885 ) 886 887 @staticmethod 888 def create_validate_adheres_to_schema( 889 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 890 ) -> ValidateAdheresToSchema: 891 base_schema = cast(Mapping[str, Any], model.base_schema) 892 return ValidateAdheresToSchema( 893 schema=base_schema, 894 ) 895 896 @staticmethod 897 def create_added_field_definition( 898 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 899 ) -> AddedFieldDefinition: 900 interpolated_value = InterpolatedString.create( 901 model.value, parameters=model.parameters or {} 902 ) 903 return AddedFieldDefinition( 904 path=model.path, 905 value=interpolated_value, 906 
value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 907 parameters=model.parameters or {}, 908 ) 909 910 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 911 added_field_definitions = [ 912 self._create_component_from_model( 913 model=added_field_definition_model, 914 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 915 added_field_definition_model.value_type 916 ), 917 config=config, 918 ) 919 for added_field_definition_model in model.fields 920 ] 921 return AddFields( 922 fields=added_field_definitions, 923 condition=model.condition or "", 924 parameters=model.parameters or {}, 925 ) 926 927 def create_keys_to_lower_transformation( 928 self, model: KeysToLowerModel, config: Config, **kwargs: Any 929 ) -> KeysToLowerTransformation: 930 return KeysToLowerTransformation() 931 932 def create_keys_to_snake_transformation( 933 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 934 ) -> KeysToSnakeCaseTransformation: 935 return KeysToSnakeCaseTransformation() 936 937 def create_keys_replace_transformation( 938 self, model: KeysReplaceModel, config: Config, **kwargs: Any 939 ) -> KeysReplaceTransformation: 940 return KeysReplaceTransformation( 941 old=model.old, new=model.new, parameters=model.parameters or {} 942 ) 943 944 def create_flatten_fields( 945 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 946 ) -> FlattenFields: 947 return FlattenFields( 948 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 949 ) 950 951 def create_dpath_flatten_fields( 952 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 953 ) -> DpathFlattenFields: 954 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 955 key_transformation = ( 956 KeyTransformation( 957 config=config, 958 prefix=model.key_transformation.prefix, 959 suffix=model.key_transformation.suffix, 960 parameters=model.parameters or {}, 961 ) 962 if model.key_transformation is not None 963 else None 964 ) 965 return DpathFlattenFields( 966 config=config, 967 field_path=model_field_path, 968 delete_origin_value=model.delete_origin_value 969 if model.delete_origin_value is not None 970 else False, 971 replace_record=model.replace_record if model.replace_record is not None else False, 972 key_transformation=key_transformation, 973 parameters=model.parameters or {}, 974 ) 975 976 @staticmethod 977 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 978 if not value_type: 979 return None 980 names_to_types = { 981 ValueType.string: str, 982 ValueType.number: float, 983 ValueType.integer: int, 984 ValueType.boolean: bool, 985 } 986 return names_to_types[value_type] 987 988 def create_api_key_authenticator( 989 self, 990 model: ApiKeyAuthenticatorModel, 991 config: Config, 992 token_provider: Optional[TokenProvider] = None, 993 **kwargs: Any, 994 ) -> ApiKeyAuthenticator: 995 if model.inject_into is None and model.header is None: 996 raise ValueError( 997 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 998 ) 999 1000 if model.inject_into is not None and model.header is not None: 1001 raise ValueError( 1002 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1003 ) 1004 1005 if token_provider is not None and model.api_token != "": 1006 raise ValueError( 1007 "If token_provider is set, api_token is ignored and has to be set to empty 
string." 1008 ) 1009 1010 request_option = ( 1011 self._create_component_from_model( 1012 model.inject_into, config, parameters=model.parameters or {} 1013 ) 1014 if model.inject_into 1015 else RequestOption( 1016 inject_into=RequestOptionType.header, 1017 field_name=model.header or "", 1018 parameters=model.parameters or {}, 1019 ) 1020 ) 1021 1022 return ApiKeyAuthenticator( 1023 token_provider=( 1024 token_provider 1025 if token_provider is not None 1026 else InterpolatedStringTokenProvider( 1027 api_token=model.api_token or "", 1028 config=config, 1029 parameters=model.parameters or {}, 1030 ) 1031 ), 1032 request_option=request_option, 1033 config=config, 1034 parameters=model.parameters or {}, 1035 ) 1036 1037 def create_legacy_to_per_partition_state_migration( 1038 self, 1039 model: LegacyToPerPartitionStateMigrationModel, 1040 config: Mapping[str, Any], 1041 declarative_stream: DeclarativeStreamModel, 1042 ) -> LegacyToPerPartitionStateMigration: 1043 retriever = declarative_stream.retriever 1044 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1045 raise ValueError( 1046 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1047 ) 1048 partition_router = retriever.partition_router 1049 if not isinstance( 1050 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1051 ): 1052 raise ValueError( 1053 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1054 ) 1055 if not hasattr(partition_router, "parent_stream_configs"): 1056 raise ValueError( 1057 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1058 ) 1059 1060 if not hasattr(declarative_stream, "incremental_sync"): 1061 raise ValueError( 1062 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1063 ) 1064 1065 return LegacyToPerPartitionStateMigration( 1066 partition_router, # type: ignore # was already checked above 1067 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
1068 config, 1069 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1070 ) 1071 1072 def create_session_token_authenticator( 1073 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1074 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1075 decoder = ( 1076 self._create_component_from_model(model=model.decoder, config=config) 1077 if model.decoder 1078 else JsonDecoder(parameters={}) 1079 ) 1080 login_requester = self._create_component_from_model( 1081 model=model.login_requester, 1082 config=config, 1083 name=f"{name}_login_requester", 1084 decoder=decoder, 1085 ) 1086 token_provider = SessionTokenProvider( 1087 login_requester=login_requester, 1088 session_token_path=model.session_token_path, 1089 expiration_duration=parse_duration(model.expiration_duration) 1090 if model.expiration_duration 1091 else None, 1092 parameters=model.parameters or {}, 1093 message_repository=self._message_repository, 1094 decoder=decoder, 1095 ) 1096 if model.request_authentication.type == "Bearer": 1097 return ModelToComponentFactory.create_bearer_authenticator( 1098 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1099 config, 1100 token_provider=token_provider, 1101 ) 1102 else: 1103 return self.create_api_key_authenticator( 1104 ApiKeyAuthenticatorModel( 1105 type="ApiKeyAuthenticator", 1106 api_token="", 1107 inject_into=model.request_authentication.inject_into, 1108 ), # type: ignore # $parameters and headers default to None 1109 config=config, 1110 token_provider=token_provider, 1111 ) 1112 1113 @staticmethod 1114 def create_basic_http_authenticator( 1115 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1116 ) -> BasicHttpAuthenticator: 1117 return BasicHttpAuthenticator( 1118 password=model.password or "", 1119 username=model.username, 1120 config=config, 1121 parameters=model.parameters or {}, 1122 ) 1123 1124 @staticmethod 1125 def create_bearer_authenticator( 1126 model: BearerAuthenticatorModel, 1127 config: Config, 1128 token_provider: Optional[TokenProvider] = None, 1129 **kwargs: Any, 1130 ) -> BearerAuthenticator: 1131 if token_provider is not None and model.api_token != "": 1132 raise ValueError( 1133 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1134 ) 1135 return BearerAuthenticator( 1136 token_provider=( 1137 token_provider 1138 if token_provider is not None 1139 else InterpolatedStringTokenProvider( 1140 api_token=model.api_token or "", 1141 config=config, 1142 parameters=model.parameters or {}, 1143 ) 1144 ), 1145 config=config, 1146 parameters=model.parameters or {}, 1147 ) 1148 1149 @staticmethod 1150 def create_dynamic_stream_check_config( 1151 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1152 ) -> DynamicStreamCheckConfig: 1153 return DynamicStreamCheckConfig( 1154 dynamic_stream_name=model.dynamic_stream_name, 1155 stream_count=model.stream_count or 0, 1156 ) 1157 1158 def create_check_stream( 1159 self, model: CheckStreamModel, config: Config, **kwargs: Any 1160 ) -> CheckStream: 1161 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1162 raise ValueError( 1163 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1164 ) 1165 1166 dynamic_streams_check_configs = ( 1167 [ 1168 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1169 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1170 ] 1171 if model.dynamic_streams_check_configs 1172 else [] 1173 ) 1174 1175 return CheckStream( 1176 stream_names=model.stream_names or [], 1177 dynamic_streams_check_configs=dynamic_streams_check_configs, 1178 parameters={}, 1179 ) 1180 1181 @staticmethod 1182 def create_check_dynamic_stream( 1183 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1184 ) -> CheckDynamicStream: 1185 assert model.use_check_availability is not None # for mypy 1186 1187 use_check_availability = model.use_check_availability 1188 1189 return CheckDynamicStream( 1190 stream_count=model.stream_count, 1191 use_check_availability=use_check_availability, 1192 parameters={}, 1193 ) 1194 1195 def create_composite_error_handler( 1196 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1197 ) -> CompositeErrorHandler: 1198 error_handlers = [ 1199 self._create_component_from_model(model=error_handler_model, config=config) 1200 for error_handler_model in model.error_handlers 1201 ] 1202 return CompositeErrorHandler( 1203 error_handlers=error_handlers, parameters=model.parameters or {} 1204 ) 1205 1206 @staticmethod 1207 def create_concurrency_level( 1208 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1209 ) -> ConcurrencyLevel: 1210 return ConcurrencyLevel( 1211 default_concurrency=model.default_concurrency, 1212 max_concurrency=model.max_concurrency, 1213 config=config, 1214 parameters={}, 1215 ) 1216 1217 @staticmethod 1218 def apply_stream_state_migrations( 1219 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1220 ) -> MutableMapping[str, Any]: 1221 if stream_state_migrations: 1222 for state_migration in stream_state_migrations: 1223 if state_migration.should_migrate(stream_state): 1224 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
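# Illustrative sketch of a single migration pass (hypothetical state shapes, not taken from
# this module): a migration whose should_migrate() matches a legacy stream-level state such as
#   {"updated_at": "2021-01-01"}
# might return a per-partition shape along the lines of
#   {"states": [{"partition": {"parent_id": "1"}, "cursor": {"updated_at": "2021-01-01"}}]}
# Wrapping the result in dict() below keeps stream_state mutable for any follow-up migration.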
1225 stream_state = dict(state_migration.migrate(stream_state)) 1226 return stream_state 1227 1228 def create_concurrent_cursor_from_datetime_based_cursor( 1229 self, 1230 model_type: Type[BaseModel], 1231 component_definition: ComponentDefinition, 1232 stream_name: str, 1233 stream_namespace: Optional[str], 1234 config: Config, 1235 message_repository: Optional[MessageRepository] = None, 1236 runtime_lookback_window: Optional[datetime.timedelta] = None, 1237 stream_state_migrations: Optional[List[Any]] = None, 1238 **kwargs: Any, 1239 ) -> ConcurrentCursor: 1240 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1241 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1242 # incoming state and connector_state_manager that is initialized when the component factory is created 1243 stream_state = ( 1244 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1245 if "stream_state" not in kwargs 1246 else kwargs["stream_state"] 1247 ) 1248 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1249 1250 component_type = component_definition.get("type") 1251 if component_definition.get("type") != model_type.__name__: 1252 raise ValueError( 1253 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1254 ) 1255 1256 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1257 1258 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1259 raise ValueError( 1260 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1261 ) 1262 1263 interpolated_cursor_field = InterpolatedString.create( 1264 datetime_based_cursor_model.cursor_field, 1265 parameters=datetime_based_cursor_model.parameters or {}, 1266 ) 1267 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1268 1269 interpolated_partition_field_start = InterpolatedString.create( 1270 datetime_based_cursor_model.partition_field_start or "start_time", 1271 parameters=datetime_based_cursor_model.parameters or {}, 1272 ) 1273 interpolated_partition_field_end = InterpolatedString.create( 1274 datetime_based_cursor_model.partition_field_end or "end_time", 1275 parameters=datetime_based_cursor_model.parameters or {}, 1276 ) 1277 1278 slice_boundary_fields = ( 1279 interpolated_partition_field_start.eval(config=config), 1280 interpolated_partition_field_end.eval(config=config), 1281 ) 1282 1283 datetime_format = datetime_based_cursor_model.datetime_format 1284 1285 cursor_granularity = ( 1286 parse_duration(datetime_based_cursor_model.cursor_granularity) 1287 if datetime_based_cursor_model.cursor_granularity 1288 else None 1289 ) 1290 1291 lookback_window = None 1292 interpolated_lookback_window = ( 1293 InterpolatedString.create( 1294 datetime_based_cursor_model.lookback_window, 1295 parameters=datetime_based_cursor_model.parameters or {}, 1296 ) 1297 if datetime_based_cursor_model.lookback_window 1298 else None 1299 ) 1300 if interpolated_lookback_window: 1301 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1302 if evaluated_lookback_window: 1303 lookback_window = parse_duration(evaluated_lookback_window) 1304 1305 connector_state_converter: DateTimeStreamStateConverter 1306 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1307 datetime_format=datetime_format, 1308 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1309 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1310 cursor_granularity=cursor_granularity, 1311 ) 1312 1313 # Adjusts the stream state by applying the runtime lookback window. 1314 # This is used to ensure correct state handling in case of failed partitions. 1315 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1316 if runtime_lookback_window and stream_state_value: 1317 new_stream_state = ( 1318 connector_state_converter.parse_timestamp(stream_state_value) 1319 - runtime_lookback_window 1320 ) 1321 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1322 new_stream_state 1323 ) 1324 1325 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1326 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1327 start_date_runtime_value = self.create_min_max_datetime( 1328 model=datetime_based_cursor_model.start_datetime, config=config 1329 ) 1330 else: 1331 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1332 1333 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1334 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1335 end_date_runtime_value = self.create_min_max_datetime( 1336 model=datetime_based_cursor_model.end_datetime, config=config 1337 ) 1338 else: 1339 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1340 1341 interpolated_start_date = MinMaxDatetime.create( 1342 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1343 parameters=datetime_based_cursor_model.parameters, 1344 ) 1345 interpolated_end_date = ( 1346 None 1347 if not end_date_runtime_value 1348 else MinMaxDatetime.create( 1349 end_date_runtime_value, datetime_based_cursor_model.parameters 1350 ) 1351 ) 1352 1353 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1354 if not interpolated_start_date.datetime_format: 1355 interpolated_start_date.datetime_format = datetime_format 1356 if interpolated_end_date and not interpolated_end_date.datetime_format: 1357 interpolated_end_date.datetime_format = datetime_format 1358 1359 start_date = interpolated_start_date.get_datetime(config=config) 1360 end_date_provider = ( 1361 partial(interpolated_end_date.get_datetime, config) 1362 if interpolated_end_date 1363 else connector_state_converter.get_end_provider() 1364 ) 1365 1366 if ( 1367 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1368 ) or ( 1369 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1370 ): 1371 raise ValueError( 1372 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1373 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1374 ) 1375 1376 # When step is not defined, default to a step size from the starting date to the present moment 1377 step_length = datetime.timedelta.max 1378 interpolated_step = ( 1379 InterpolatedString.create( 1380 datetime_based_cursor_model.step, 1381 parameters=datetime_based_cursor_model.parameters or {}, 1382 ) 1383 if datetime_based_cursor_model.step 1384 else None 1385 ) 1386 if interpolated_step: 1387 evaluated_step = interpolated_step.eval(config) 1388 if evaluated_step: 1389 step_length = parse_duration(evaluated_step) 1390 1391 clamping_strategy: ClampingStrategy = NoClamping() 1392 if datetime_based_cursor_model.clamping: 1393 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1394 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1395 # object which we want to keep agnostic of being low-code 1396 target = InterpolatedString( 1397 string=datetime_based_cursor_model.clamping.target, 1398 parameters=datetime_based_cursor_model.parameters or {}, 1399 ) 1400 evaluated_target = target.eval(config=config) 1401 match evaluated_target: 1402 case "DAY": 1403 clamping_strategy = DayClampingStrategy() 1404 end_date_provider = ClampingEndProvider( 1405 DayClampingStrategy(is_ceiling=False), 1406 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1407 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1408 ) 1409 case "WEEK": 1410 if ( 1411 not datetime_based_cursor_model.clamping.target_details 1412 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1413 ): 1414 raise ValueError( 1415 "Given WEEK clamping, weekday needs to be provided as target_details" 1416 ) 1417 weekday = self._assemble_weekday( 1418 datetime_based_cursor_model.clamping.target_details["weekday"] 1419 ) 1420 clamping_strategy = WeekClampingStrategy(weekday) 1421 end_date_provider = ClampingEndProvider( 1422 WeekClampingStrategy(weekday, is_ceiling=False), 1423 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1424 granularity=cursor_granularity or datetime.timedelta(days=1), 1425 ) 1426 case "MONTH": 1427 clamping_strategy = MonthClampingStrategy() 1428 end_date_provider = ClampingEndProvider( 1429 MonthClampingStrategy(is_ceiling=False), 1430 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1431 granularity=cursor_granularity or datetime.timedelta(days=1), 1432 ) 1433 case _: 1434 raise ValueError( 1435 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1436 ) 1437 1438 return ConcurrentCursor( 1439 stream_name=stream_name, 1440 stream_namespace=stream_namespace, 1441 stream_state=stream_state, 1442 message_repository=message_repository or self._message_repository, 1443 connector_state_manager=self._connector_state_manager, 1444 connector_state_converter=connector_state_converter, 1445 cursor_field=cursor_field, 1446 slice_boundary_fields=slice_boundary_fields, 1447 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1448 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1449 lookback_window=lookback_window, 1450 slice_range=step_length, 1451 cursor_granularity=cursor_granularity, 1452 clamping_strategy=clamping_strategy, 1453 ) 1454 1455 def create_concurrent_cursor_from_incrementing_count_cursor( 1456 self, 1457 model_type: Type[BaseModel], 1458 component_definition: ComponentDefinition, 1459 stream_name: str, 1460 stream_namespace: Optional[str], 1461 config: Config, 1462 message_repository: Optional[MessageRepository] = None, 1463 **kwargs: Any, 1464 ) -> ConcurrentCursor: 1465 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1466 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1467 # incoming state and connector_state_manager that is initialized when the component factory is created 1468 stream_state = ( 1469 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1470 if "stream_state" not in kwargs 1471 else kwargs["stream_state"] 1472 ) 1473 1474 component_type = component_definition.get("type") 1475 if component_definition.get("type") != model_type.__name__: 1476 raise ValueError( 1477 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1478 ) 1479 1480 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1481 1482 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1483 raise ValueError( 1484 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1485 ) 1486 1487 interpolated_start_value = ( 1488 InterpolatedString.create( 1489 incrementing_count_cursor_model.start_value, # type: ignore 1490 parameters=incrementing_count_cursor_model.parameters or {}, 1491 ) 1492 if incrementing_count_cursor_model.start_value 1493 else 0 1494 ) 1495 1496 interpolated_cursor_field = InterpolatedString.create( 1497 incrementing_count_cursor_model.cursor_field, 1498 parameters=incrementing_count_cursor_model.parameters or {}, 1499 ) 1500 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1501 1502 connector_state_converter = IncrementingCountStreamStateConverter( 1503 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1504 ) 1505 1506 return ConcurrentCursor( 1507 stream_name=stream_name, 1508 stream_namespace=stream_namespace, 1509 stream_state=stream_state, 1510 message_repository=message_repository or self._message_repository, 
1511 connector_state_manager=self._connector_state_manager, 1512 connector_state_converter=connector_state_converter, 1513 cursor_field=cursor_field, 1514 slice_boundary_fields=None, 1515 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1517 ) 1518 1519 def _assemble_weekday(self, weekday: str) -> Weekday: 1520 match weekday: 1521 case "MONDAY": 1522 return Weekday.MONDAY 1523 case "TUESDAY": 1524 return Weekday.TUESDAY 1525 case "WEDNESDAY": 1526 return Weekday.WEDNESDAY 1527 case "THURSDAY": 1528 return Weekday.THURSDAY 1529 case "FRIDAY": 1530 return Weekday.FRIDAY 1531 case "SATURDAY": 1532 return Weekday.SATURDAY 1533 case "SUNDAY": 1534 return Weekday.SUNDAY 1535 case _: 1536 raise ValueError(f"Unknown weekday {weekday}") 1537 1538 def create_concurrent_cursor_from_perpartition_cursor( 1539 self, 1540 state_manager: ConnectorStateManager, 1541 model_type: Type[BaseModel], 1542 component_definition: ComponentDefinition, 1543 stream_name: str, 1544 stream_namespace: Optional[str], 1545 config: Config, 1546 stream_state: MutableMapping[str, Any], 1547 partition_router: PartitionRouter, 1548 stream_state_migrations: Optional[List[Any]] = None, 1549 **kwargs: Any, 1550 ) -> ConcurrentPerPartitionCursor: 1551 component_type = component_definition.get("type") 1552 if component_definition.get("type") != model_type.__name__: 1553 raise ValueError( 1554 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1555 ) 1556 1557 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1558 1559 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1560 raise ValueError( 1561 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1562 ) 1563 1564 interpolated_cursor_field = InterpolatedString.create( 1565 datetime_based_cursor_model.cursor_field, 1566 parameters=datetime_based_cursor_model.parameters or {}, 1567 ) 1568 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1569 1570 datetime_format = datetime_based_cursor_model.datetime_format 1571 1572 cursor_granularity = ( 1573 parse_duration(datetime_based_cursor_model.cursor_granularity) 1574 if datetime_based_cursor_model.cursor_granularity 1575 else None 1576 ) 1577 1578 connector_state_converter: DateTimeStreamStateConverter 1579 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1580 datetime_format=datetime_format, 1581 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1582 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1583 cursor_granularity=cursor_granularity, 1584 ) 1585 1586 # Create the cursor factory 1587 cursor_factory = ConcurrentCursorFactory( 1588 partial( 1589 self.create_concurrent_cursor_from_datetime_based_cursor, 1590 state_manager=state_manager, 1591 model_type=model_type, 1592 component_definition=component_definition, 1593 stream_name=stream_name, 1594 stream_namespace=stream_namespace, 1595 config=config, 1596 message_repository=NoopMessageRepository(), 1597 stream_state_migrations=stream_state_migrations, 1598 ) 1599 ) 1600 1601 
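# Rough sketch of how the factory above is consumed (assumed call shape, inferred from the
# keyword arguments accepted by create_concurrent_cursor_from_datetime_based_cursor): the
# ConcurrentPerPartitionCursor built below invokes it once per partition, e.g.
#   cursor_factory.create(stream_state={"updated_at": "2024-01-01"}, runtime_lookback_window=None)
# Each child cursor is given a NoopMessageRepository so that only the parent per-partition
# cursor emits state messages.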
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1602 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1603 use_global_cursor = isinstance( 1604 partition_router, GroupingPartitionRouter 1605 ) or component_definition.get("global_substream_cursor", False) 1606 1607 # Return the concurrent cursor and state converter 1608 return ConcurrentPerPartitionCursor( 1609 cursor_factory=cursor_factory, 1610 partition_router=partition_router, 1611 stream_name=stream_name, 1612 stream_namespace=stream_namespace, 1613 stream_state=stream_state, 1614 message_repository=self._message_repository, # type: ignore 1615 connector_state_manager=state_manager, 1616 connector_state_converter=connector_state_converter, 1617 cursor_field=cursor_field, 1618 use_global_cursor=use_global_cursor, 1619 ) 1620 1621 @staticmethod 1622 def create_constant_backoff_strategy( 1623 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1624 ) -> ConstantBackoffStrategy: 1625 return ConstantBackoffStrategy( 1626 backoff_time_in_seconds=model.backoff_time_in_seconds, 1627 config=config, 1628 parameters=model.parameters or {}, 1629 ) 1630 1631 def create_cursor_pagination( 1632 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1633 ) -> CursorPaginationStrategy: 1634 if isinstance(decoder, PaginationDecoderDecorator): 1635 inner_decoder = decoder.decoder 1636 else: 1637 inner_decoder = decoder 1638 decoder = PaginationDecoderDecorator(decoder=decoder) 1639 1640 if self._is_supported_decoder_for_pagination(inner_decoder): 1641 decoder_to_use = decoder 1642 else: 1643 raise ValueError( 1644 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1645 ) 1646 1647 return CursorPaginationStrategy( 1648 cursor_value=model.cursor_value, 1649 decoder=decoder_to_use, 1650 page_size=model.page_size, 1651 stop_condition=model.stop_condition, 1652 config=config, 1653 parameters=model.parameters or {}, 1654 ) 1655 1656 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1657 """ 1658 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1659 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1660 :param model: The Pydantic model of the custom component being created 1661 :param config: The custom defined connector config 1662 :return: The declarative component built from the Pydantic model to be used at runtime 1663 """ 1664 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1665 component_fields = get_type_hints(custom_component_class) 1666 model_args = model.dict() 1667 model_args["config"] = config 1668 1669 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1670 # we defer to these arguments over the component's definition 1671 for key, arg in kwargs.items(): 1672 model_args[key] = arg 1673 1674 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1675 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1676 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1677 for model_field, model_value in model_args.items(): 1678 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1679 if ( 1680 isinstance(model_value, dict) 1681 and "type" not in model_value 1682 and model_field in component_fields 1683 ): 1684 derived_type = self._derive_component_type_from_type_hints( 1685 component_fields.get(model_field) 1686 ) 1687 if derived_type: 1688 model_value["type"] = derived_type 1689 1690 if self._is_component(model_value): 1691 model_args[model_field] = self._create_nested_component( 1692 model, model_field, model_value, config 1693 ) 1694 elif isinstance(model_value, list): 1695 vals = [] 1696 for v in model_value: 1697 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1698 derived_type = self._derive_component_type_from_type_hints( 1699 component_fields.get(model_field) 1700 ) 1701 if derived_type: 1702 v["type"] = derived_type 1703 if self._is_component(v): 1704 vals.append(self._create_nested_component(model, model_field, v, config)) 1705 else: 1706 vals.append(v) 1707 model_args[model_field] = vals 1708 1709 kwargs = { 1710 class_field: model_args[class_field] 1711 for class_field in component_fields.keys() 1712 if class_field in model_args 1713 } 1714 return custom_component_class(**kwargs) 1715 1716 @staticmethod 1717 def _get_class_from_fully_qualified_class_name( 1718 full_qualified_class_name: str, 1719 ) -> Any: 1720 """Get a class from its fully qualified name. 1721 1722 If a custom components module is needed, we assume it is already registered - probably 1723 as `source_declarative_manifest.components` or `components`. 1724 1725 Args: 1726 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1727 1728 Returns: 1729 Any: The class object. 1730 1731 Raises: 1732 ValueError: If the class cannot be loaded. 1733 """ 1734 split = full_qualified_class_name.split(".") 1735 module_name_full = ".".join(split[:-1]) 1736 class_name = split[-1] 1737 1738 try: 1739 module_ref = importlib.import_module(module_name_full) 1740 except ModuleNotFoundError as e: 1741 if split[0] == "source_declarative_manifest": 1742 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it at runtime using sys.path.append 1743 try: 1744 import os 1745 1746 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1747 module_ref = importlib.import_module( 1748 module_name_with_source_declarative_manifest 1749 ) 1750 except ModuleNotFoundError: 1751 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1752 else: 1753 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1754 1755 try: 1756 return getattr(module_ref, class_name) 1757 except AttributeError as e: 1758 raise ValueError( 1759 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1760 ) from e 1761 1762 @staticmethod 1763 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1764 interface = field_type 1765 while True: 1766 origin = get_origin(interface) 1767 if origin: 1768 # Unnest types until we reach the raw type 1769 # List[T] -> T 1770 # Optional[List[T]] -> T 1771 args = get_args(interface) 1772 interface = args[0] 1773 else: 1774 break 1775 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1776 return interface.__name__ 1777 return None 1778 1779 @staticmethod 1780 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1781 if not cls: 1782 return False 1783 return cls.__module__ == "builtins" 1784 1785 @staticmethod 1786 def _extract_missing_parameters(error: TypeError) -> List[str]: 1787 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1788 if parameter_search: 1789 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1790 else: 1791 return [] 1792 1793 def _create_nested_component( 1794 self, model: Any, model_field: str, model_value: Any, config: Config 1795 ) -> Any: 1796 type_name = model_value.get("type", None) 1797 if not type_name: 1798 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1799 return model_value 1800 1801 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1802 if model_type: 1803 parsed_model = model_type.parse_obj(model_value) 1804 try: 1805 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1806 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1807 # components and passing them to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1808 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1809 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1810 # are needed by a component and could not otherwise be shared.
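# Hypothetical manifest snippet illustrating the mechanism described above: a child of a
# CustomRetriever cannot receive url_base from its parent the way it would under a
# SimpleRetriever, so the connector developer supplies it explicitly via $parameters
# (the URL is made up for illustration):
#
#   paginator:
#     type: DefaultPaginator
#     $parameters:
#       url_base: "https://api.example.com/v1"
#
# The lookup below then matches "url_base" against the keyword-only arguments of the
# corresponding create_* method and forwards it when building the component.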
1811 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1812 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1813 model_parameters = model_value.get("$parameters", {}) 1814 matching_parameters = { 1815 kwarg: model_parameters[kwarg] 1816 for kwarg in constructor_kwargs 1817 if kwarg in model_parameters 1818 } 1819 return self._create_component_from_model( 1820 model=parsed_model, config=config, **matching_parameters 1821 ) 1822 except TypeError as error: 1823 missing_parameters = self._extract_missing_parameters(error) 1824 if missing_parameters: 1825 raise ValueError( 1826 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1827 + ", ".join( 1828 ( 1829 f"{type_name}.$parameters.{parameter}" 1830 for parameter in missing_parameters 1831 ) 1832 ) 1833 ) 1834 raise TypeError( 1835 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1836 ) 1837 else: 1838 raise ValueError( 1839 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1840 ) 1841 1842 @staticmethod 1843 def _is_component(model_value: Any) -> bool: 1844 return isinstance(model_value, dict) and model_value.get("type") is not None 1845 1846 def create_datetime_based_cursor( 1847 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1848 ) -> DatetimeBasedCursor: 1849 start_datetime: Union[str, MinMaxDatetime] = ( 1850 model.start_datetime 1851 if isinstance(model.start_datetime, str) 1852 else self.create_min_max_datetime(model.start_datetime, config) 1853 ) 1854 end_datetime: Union[str, MinMaxDatetime, None] = None 1855 if model.is_data_feed and model.end_datetime: 1856 raise ValueError("Data feed does not support end_datetime") 1857 if model.is_data_feed and model.is_client_side_incremental: 1858 raise ValueError( 1859 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1860 ) 1861 if model.end_datetime: 1862 end_datetime = ( 1863 model.end_datetime 1864 if isinstance(model.end_datetime, str) 1865 else self.create_min_max_datetime(model.end_datetime, config) 1866 ) 1867 1868 end_time_option = ( 1869 self._create_component_from_model( 1870 model.end_time_option, config, parameters=model.parameters or {} 1871 ) 1872 if model.end_time_option 1873 else None 1874 ) 1875 start_time_option = ( 1876 self._create_component_from_model( 1877 model.start_time_option, config, parameters=model.parameters or {} 1878 ) 1879 if model.start_time_option 1880 else None 1881 ) 1882 1883 return DatetimeBasedCursor( 1884 cursor_field=model.cursor_field, 1885 cursor_datetime_formats=model.cursor_datetime_formats 1886 if model.cursor_datetime_formats 1887 else [], 1888 cursor_granularity=model.cursor_granularity, 1889 datetime_format=model.datetime_format, 1890 end_datetime=end_datetime, 1891 start_datetime=start_datetime, 1892 step=model.step, 1893 end_time_option=end_time_option, 1894 lookback_window=model.lookback_window, 1895 start_time_option=start_time_option, 1896 partition_field_end=model.partition_field_end, 1897 partition_field_start=model.partition_field_start, 1898 message_repository=self._message_repository, 1899 is_compare_strictly=model.is_compare_strictly, 1900 config=config, 1901 parameters=model.parameters or {}, 1902 ) 1903 1904 def create_declarative_stream( 1905 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1906 ) -> DeclarativeStream: 1907 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1908 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1909 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1910 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1911 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1912 1913 primary_key = model.primary_key.__root__ if model.primary_key else None 1914 stop_condition_on_cursor = ( 1915 model.incremental_sync 1916 and hasattr(model.incremental_sync, "is_data_feed") 1917 and model.incremental_sync.is_data_feed 1918 ) 1919 client_side_incremental_sync = None 1920 if ( 1921 model.incremental_sync 1922 and hasattr(model.incremental_sync, "is_client_side_incremental") 1923 and model.incremental_sync.is_client_side_incremental 1924 ): 1925 supported_slicers = ( 1926 DatetimeBasedCursor, 1927 GlobalSubstreamCursor, 1928 PerPartitionWithGlobalCursor, 1929 ) 1930 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1931 raise ValueError( 1932 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1933 ) 1934 cursor = ( 1935 combined_slicers 1936 if isinstance( 1937 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1938 ) 1939 else self._create_component_from_model(model=model.incremental_sync, config=config) 1940 ) 1941 1942 client_side_incremental_sync = {"cursor": cursor} 1943 1944 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1945 cursor_model = model.incremental_sync 1946 1947 end_time_option = ( 1948 self._create_component_from_model( 1949 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1950 ) 1951 if cursor_model.end_time_option 1952 else None 1953 ) 1954 start_time_option = ( 1955 self._create_component_from_model( 1956 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1957 ) 1958 if cursor_model.start_time_option 1959 else None 1960 ) 1961 1962 request_options_provider = DatetimeBasedRequestOptionsProvider( 1963 start_time_option=start_time_option, 1964 end_time_option=end_time_option, 1965 partition_field_start=cursor_model.partition_field_end, 1966 partition_field_end=cursor_model.partition_field_end, 1967 config=config, 1968 parameters=model.parameters or {}, 1969 ) 1970 elif model.incremental_sync and isinstance( 1971 model.incremental_sync, IncrementingCountCursorModel 1972 ): 1973 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1974 1975 start_time_option = ( 1976 self._create_component_from_model( 1977 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1978 config, 1979 parameters=cursor_model.parameters or {}, 1980 ) 1981 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1982 else None 1983 ) 1984 1985 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1986 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1987 partition_field_start = "start" 1988 1989 request_options_provider = DatetimeBasedRequestOptionsProvider( 1990 start_time_option=start_time_option, 1991 partition_field_start=partition_field_start, 1992 config=config, 1993 parameters=model.parameters or {}, 1994 ) 1995 else: 1996 request_options_provider = None 1997 1998 transformations = [] 1999 if model.transformations: 2000 for transformation_model in model.transformations: 2001 transformations.append( 2002 self._create_component_from_model(model=transformation_model, config=config) 2003 ) 2004 file_uploader = None 2005 if model.file_uploader: 2006 file_uploader = self._create_component_from_model( 2007 model=model.file_uploader, config=config 2008 ) 2009 2010 retriever = self._create_component_from_model( 2011 model=model.retriever, 2012 config=config, 2013 name=model.name, 2014 primary_key=primary_key, 2015 stream_slicer=combined_slicers, 2016 request_options_provider=request_options_provider, 2017 stop_condition_on_cursor=stop_condition_on_cursor, 2018 client_side_incremental_sync=client_side_incremental_sync, 2019 transformations=transformations, 2020 file_uploader=file_uploader, 2021 incremental_sync=model.incremental_sync, 2022 ) 2023 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2024 2025 if model.state_migrations: 2026 state_transformations = [ 2027 self._create_component_from_model(state_migration, config, declarative_stream=model) 2028 for 
state_migration in model.state_migrations 2029 ] 2030 else: 2031 state_transformations = [] 2032 2033 schema_loader: Union[ 2034 CompositeSchemaLoader, 2035 DefaultSchemaLoader, 2036 DynamicSchemaLoader, 2037 InlineSchemaLoader, 2038 JsonFileSchemaLoader, 2039 ] 2040 if model.schema_loader and isinstance(model.schema_loader, list): 2041 nested_schema_loaders = [ 2042 self._create_component_from_model(model=nested_schema_loader, config=config) 2043 for nested_schema_loader in model.schema_loader 2044 ] 2045 schema_loader = CompositeSchemaLoader( 2046 schema_loaders=nested_schema_loaders, parameters={} 2047 ) 2048 elif model.schema_loader: 2049 schema_loader = self._create_component_from_model( 2050 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2051 config=config, 2052 ) 2053 else: 2054 options = model.parameters or {} 2055 if "name" not in options: 2056 options["name"] = model.name 2057 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2058 2059 return DeclarativeStream( 2060 name=model.name or "", 2061 primary_key=primary_key, 2062 retriever=retriever, 2063 schema_loader=schema_loader, 2064 stream_cursor_field=cursor_field or "", 2065 state_migrations=state_transformations, 2066 config=config, 2067 parameters=model.parameters or {}, 2068 ) 2069 2070 def _build_stream_slicer_from_partition_router( 2071 self, 2072 model: Union[ 2073 AsyncRetrieverModel, 2074 CustomRetrieverModel, 2075 SimpleRetrieverModel, 2076 ], 2077 config: Config, 2078 stream_name: Optional[str] = None, 2079 ) -> Optional[PartitionRouter]: 2080 if ( 2081 hasattr(model, "partition_router") 2082 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2083 and model.partition_router 2084 ): 2085 stream_slicer_model = model.partition_router 2086 if isinstance(stream_slicer_model, list): 2087 return CartesianProductStreamSlicer( 2088 [ 2089 self._create_component_from_model( 2090 model=slicer, config=config, stream_name=stream_name or "" 2091 ) 2092 for slicer in stream_slicer_model 2093 ], 2094 parameters={}, 2095 ) 2096 else: 2097 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2098 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2099 ) 2100 return None 2101 2102 def _build_incremental_cursor( 2103 self, 2104 model: DeclarativeStreamModel, 2105 stream_slicer: Optional[PartitionRouter], 2106 config: Config, 2107 ) -> Optional[StreamSlicer]: 2108 if model.incremental_sync and stream_slicer: 2109 if model.retriever.type == "AsyncRetriever": 2110 stream_name = model.name or "" 2111 stream_namespace = None 2112 stream_state = self._connector_state_manager.get_stream_state( 2113 stream_name, stream_namespace 2114 ) 2115 state_transformations = ( 2116 [ 2117 self._create_component_from_model( 2118 state_migration, config, declarative_stream=model 2119 ) 2120 for state_migration in model.state_migrations 2121 ] 2122 if model.state_migrations 2123 else [] 2124 ) 2125 2126 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2127 state_manager=self._connector_state_manager, 2128 model_type=DatetimeBasedCursorModel, 2129 component_definition=model.incremental_sync.__dict__, 2130 stream_name=stream_name, 2131 stream_namespace=stream_namespace, 2132 config=config or {}, 2133 stream_state=stream_state, 2134 stream_state_migrations=state_transformations, 2135 partition_router=stream_slicer, 2136 ) 2137 2138 incremental_sync_model = model.incremental_sync 2139 cursor_component = self._create_component_from_model( 2140 model=incremental_sync_model, config=config 2141 ) 2142 is_global_cursor = ( 2143 hasattr(incremental_sync_model, "global_substream_cursor") 2144 and incremental_sync_model.global_substream_cursor 2145 ) 2146 2147 if is_global_cursor: 2148 return GlobalSubstreamCursor( 2149 stream_cursor=cursor_component, partition_router=stream_slicer 2150 ) 2151 return PerPartitionWithGlobalCursor( 2152 cursor_factory=CursorFactory( 2153 lambda: self._create_component_from_model( 2154 model=incremental_sync_model, config=config 2155 ), 2156 ), 2157 partition_router=stream_slicer, 2158 stream_cursor=cursor_component, 2159 ) 2160 elif model.incremental_sync: 2161 if model.retriever.type == "AsyncRetriever": 2162 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2163 model_type=DatetimeBasedCursorModel, 2164 component_definition=model.incremental_sync.__dict__, 2165 stream_name=model.name or "", 2166 stream_namespace=None, 2167 config=config or {}, 2168 stream_state_migrations=model.state_migrations, 2169 ) 2170 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2171 return None 2172 2173 def _build_resumable_cursor( 2174 self, 2175 model: Union[ 2176 AsyncRetrieverModel, 2177 CustomRetrieverModel, 2178 SimpleRetrieverModel, 2179 ], 2180 stream_slicer: Optional[PartitionRouter], 2181 ) -> Optional[StreamSlicer]: 2182 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2183 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2184 return ResumableFullRefreshCursor(parameters={}) 2185 elif stream_slicer: 2186 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2187 return PerPartitionCursor( 2188 cursor_factory=CursorFactory( 2189 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2190 ), 2191 partition_router=stream_slicer, 2192 ) 2193 return None 2194 2195 def _merge_stream_slicers( 2196 self, model: DeclarativeStreamModel, config: Config 2197 ) -> Optional[StreamSlicer]: 2198 retriever_model = model.retriever 2199 2200 stream_slicer = self._build_stream_slicer_from_partition_router( 2201 retriever_model, config, stream_name=model.name 2202 ) 2203 2204 if retriever_model.type == "AsyncRetriever": 2205 is_not_datetime_cursor = ( 2206 model.incremental_sync.type != "DatetimeBasedCursor" 2207 if model.incremental_sync 2208 else None 2209 ) 2210 is_partition_router = ( 2211 
bool(retriever_model.partition_router) if model.incremental_sync else None 2212 ) 2213 2214 if is_not_datetime_cursor: 2215 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2216 # support of unordered slices (for example, when we trigger reports for January and February, the report 2217 # in February can be completed first). Once we have support for a custom concurrent cursor or have a new 2218 # implementation available in the CDK, we can enable more cursors here. 2219 raise ValueError( 2220 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2221 ) 2222 2223 if is_partition_router and not stream_slicer: 2224 # Note that per-partition support is being developed in parallel; once merged, we could support it 2225 # here by calling create_concurrent_cursor_from_perpartition_cursor 2226 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2227 2228 if model.incremental_sync: 2229 return self._build_incremental_cursor(model, stream_slicer, config) 2230 2231 return ( 2232 stream_slicer 2233 if self._disable_resumable_full_refresh 2234 else self._build_resumable_cursor(retriever_model, stream_slicer) 2235 ) 2236 2237 def create_default_error_handler( 2238 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2239 ) -> DefaultErrorHandler: 2240 backoff_strategies = [] 2241 if model.backoff_strategies: 2242 for backoff_strategy_model in model.backoff_strategies: 2243 backoff_strategies.append( 2244 self._create_component_from_model(model=backoff_strategy_model, config=config) 2245 ) 2246 2247 response_filters = [] 2248 if model.response_filters: 2249 for response_filter_model in model.response_filters: 2250 response_filters.append( 2251 self._create_component_from_model(model=response_filter_model, config=config) 2252 ) 2253 response_filters.append( 2254 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2255 ) 2256 2257 return DefaultErrorHandler( 2258 backoff_strategies=backoff_strategies, 2259 max_retries=model.max_retries, 2260 response_filters=response_filters, 2261 config=config, 2262 parameters=model.parameters or {}, 2263 ) 2264 2265 def create_default_paginator( 2266 self, 2267 model: DefaultPaginatorModel, 2268 config: Config, 2269 *, 2270 url_base: str, 2271 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2272 decoder: Optional[Decoder] = None, 2273 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2274 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2275 if decoder: 2276 if self._is_supported_decoder_for_pagination(decoder): 2277 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2278 else: 2279 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2280 else: 2281 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2282 page_size_option = ( 2283 self._create_component_from_model(model=model.page_size_option, config=config) 2284 if model.page_size_option 2285 else None 2286 ) 2287 page_token_option = ( 2288 self._create_component_from_model(model=model.page_token_option, config=config) 2289 if model.page_token_option 2290 else None 2291 ) 2292 pagination_strategy = self._create_component_from_model( 2293 model=model.pagination_strategy, 2294 config=config, 2295 decoder=decoder_to_use, 2296 extractor_model=extractor_model, 2297 ) 2298 if cursor_used_for_stop_condition: 2299
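# For incremental streams that behave like data feeds, the cursor doubles as a stop
# condition: the decorator below wraps the strategy so pagination halts once the cursor
# reports that a page's records no longer need to be synced (a rough reading of
# CursorStopCondition, e.g. its is_met(last_record) check returning True).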
pagination_strategy = StopConditionPaginationStrategyDecorator( 2300 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2301 ) 2302 paginator = DefaultPaginator( 2303 decoder=decoder_to_use, 2304 page_size_option=page_size_option, 2305 page_token_option=page_token_option, 2306 pagination_strategy=pagination_strategy, 2307 url_base=url_base, 2308 config=config, 2309 parameters=model.parameters or {}, 2310 ) 2311 if self._limit_pages_fetched_per_slice: 2312 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2313 return paginator 2314 2315 def create_dpath_extractor( 2316 self, 2317 model: DpathExtractorModel, 2318 config: Config, 2319 decoder: Optional[Decoder] = None, 2320 **kwargs: Any, 2321 ) -> DpathExtractor: 2322 if decoder: 2323 decoder_to_use = decoder 2324 else: 2325 decoder_to_use = JsonDecoder(parameters={}) 2326 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2327 return DpathExtractor( 2328 decoder=decoder_to_use, 2329 field_path=model_field_path, 2330 config=config, 2331 parameters=model.parameters or {}, 2332 ) 2333 2334 @staticmethod 2335 def create_response_to_file_extractor( 2336 model: ResponseToFileExtractorModel, 2337 **kwargs: Any, 2338 ) -> ResponseToFileExtractor: 2339 return ResponseToFileExtractor(parameters=model.parameters or {}) 2340 2341 @staticmethod 2342 def create_exponential_backoff_strategy( 2343 model: ExponentialBackoffStrategyModel, config: Config 2344 ) -> ExponentialBackoffStrategy: 2345 return ExponentialBackoffStrategy( 2346 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2347 ) 2348 2349 @staticmethod 2350 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2351 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2352 2353 def create_http_requester( 2354 self, 2355 model: HttpRequesterModel, 2356 config: Config, 2357 decoder: Decoder = JsonDecoder(parameters={}), 2358 query_properties_key: Optional[str] = None, 2359 use_cache: Optional[bool] = None, 2360 *, 2361 name: str, 2362 ) -> HttpRequester: 2363 authenticator = ( 2364 self._create_component_from_model( 2365 model=model.authenticator, 2366 config=config, 2367 url_base=model.url or model.url_base, 2368 name=name, 2369 decoder=decoder, 2370 ) 2371 if model.authenticator 2372 else None 2373 ) 2374 error_handler = ( 2375 self._create_component_from_model(model=model.error_handler, config=config) 2376 if model.error_handler 2377 else DefaultErrorHandler( 2378 backoff_strategies=[], 2379 response_filters=[], 2380 config=config, 2381 parameters=model.parameters or {}, 2382 ) 2383 ) 2384 2385 api_budget = self._api_budget 2386 2387 # Removes QueryProperties components from the interpolated mappings because it has been designed 2388 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2389 # instead of through jinja interpolation 2390 request_parameters: Optional[Union[str, Mapping[str, str]]] 2391 if isinstance(model.request_parameters, Mapping): 2392 request_parameters = self._remove_query_properties(model.request_parameters) 2393 else: 2394 request_parameters = model.request_parameters 2395 2396 request_options_provider = InterpolatedRequestOptionsProvider( 2397 request_body=model.request_body, 2398 request_body_data=model.request_body_data, 2399 request_body_json=model.request_body_json, 2400 request_headers=model.request_headers, 2401 
request_parameters=request_parameters, 2402 query_properties_key=query_properties_key, 2403 config=config, 2404 parameters=model.parameters or {}, 2405 ) 2406 2407 assert model.use_cache is not None # for mypy 2408 assert model.http_method is not None # for mypy 2409 2410 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2411 2412 return HttpRequester( 2413 name=name, 2414 url=model.url, 2415 url_base=model.url_base, 2416 path=model.path, 2417 authenticator=authenticator, 2418 error_handler=error_handler, 2419 api_budget=api_budget, 2420 http_method=HttpMethod[model.http_method.value], 2421 request_options_provider=request_options_provider, 2422 config=config, 2423 disable_retries=self._disable_retries, 2424 parameters=model.parameters or {}, 2425 message_repository=self._message_repository, 2426 use_cache=should_use_cache, 2427 decoder=decoder, 2428 stream_response=decoder.is_stream_response() if decoder else False, 2429 ) 2430 2431 @staticmethod 2432 def create_http_response_filter( 2433 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2434 ) -> HttpResponseFilter: 2435 if model.action: 2436 action = ResponseAction(model.action.value) 2437 else: 2438 action = None 2439 2440 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2441 2442 http_codes = ( 2443 set(model.http_codes) if model.http_codes else set() 2444 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2445 2446 return HttpResponseFilter( 2447 action=action, 2448 failure_type=failure_type, 2449 error_message=model.error_message or "", 2450 error_message_contains=model.error_message_contains or "", 2451 http_codes=http_codes, 2452 predicate=model.predicate or "", 2453 config=config, 2454 parameters=model.parameters or {}, 2455 ) 2456 2457 @staticmethod 2458 def create_inline_schema_loader( 2459 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2460 ) -> InlineSchemaLoader: 2461 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2462 2463 def create_complex_field_type( 2464 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2465 ) -> ComplexFieldType: 2466 items = ( 2467 self._create_component_from_model(model=model.items, config=config) 2468 if isinstance(model.items, ComplexFieldTypeModel) 2469 else model.items 2470 ) 2471 2472 return ComplexFieldType(field_type=model.field_type, items=items) 2473 2474 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2475 target_type = ( 2476 self._create_component_from_model(model=model.target_type, config=config) 2477 if isinstance(model.target_type, ComplexFieldTypeModel) 2478 else model.target_type 2479 ) 2480 2481 return TypesMap( 2482 target_type=target_type, 2483 current_type=model.current_type, 2484 condition=model.condition if model.condition is not None else "True", 2485 ) 2486 2487 def create_schema_type_identifier( 2488 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2489 ) -> SchemaTypeIdentifier: 2490 types_mapping = [] 2491 if model.types_mapping: 2492 types_mapping.extend( 2493 [ 2494 self._create_component_from_model(types_map, config=config) 2495 for types_map in model.types_mapping 2496 ] 2497 ) 2498 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2499 [x for x in model.schema_pointer] if model.schema_pointer else [] 2500 ) 2501 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2502 
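# Illustrative (hypothetical) discover response for the pointers assembled here:
#   {"fields": [{"name": "id", "type": "integer"}, {"name": "email", "type": "string"}]}
# With schema_pointer=["fields"], key_pointer=["name"] and type_pointer=["type"], each entry
# becomes a schema property keyed by "name" whose type is resolved through types_mapping.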
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2503 [x for x in model.type_pointer] if model.type_pointer else None 2504 ) 2505 2506 return SchemaTypeIdentifier( 2507 schema_pointer=model_schema_pointer, 2508 key_pointer=model_key_pointer, 2509 type_pointer=model_type_pointer, 2510 types_mapping=types_mapping, 2511 parameters=model.parameters or {}, 2512 ) 2513 2514 def create_dynamic_schema_loader( 2515 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2516 ) -> DynamicSchemaLoader: 2517 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2518 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2519 2520 schema_transformations = [] 2521 if model.schema_transformations: 2522 for transformation_model in model.schema_transformations: 2523 schema_transformations.append( 2524 self._create_component_from_model(model=transformation_model, config=config) 2525 ) 2526 name = "dynamic_properties" 2527 retriever = self._create_component_from_model( 2528 model=model.retriever, 2529 config=config, 2530 name=name, 2531 primary_key=None, 2532 stream_slicer=combined_slicers, 2533 transformations=[], 2534 use_cache=True, 2535 log_formatter=( 2536 lambda response: format_http_message( 2537 response, 2538 f"Schema loader '{name}' request", 2539 f"Request performed in order to extract schema.", 2540 name, 2541 is_auxiliary=True, 2542 ) 2543 ), 2544 ) 2545 schema_type_identifier = self._create_component_from_model( 2546 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2547 ) 2548 schema_filter = ( 2549 self._create_component_from_model( 2550 model.schema_filter, config=config, parameters=model.parameters or {} 2551 ) 2552 if model.schema_filter is not None 2553 else None 2554 ) 2555 2556 return DynamicSchemaLoader( 2557 retriever=retriever, 2558 config=config, 2559 schema_transformations=schema_transformations, 2560 schema_filter=schema_filter, 2561 schema_type_identifier=schema_type_identifier, 2562 parameters=model.parameters or {}, 2563 ) 2564 2565 @staticmethod 2566 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2567 return JsonDecoder(parameters={}) 2568 2569 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2570 return CompositeRawDecoder( 2571 parser=ModelToComponentFactory._get_parser(model, config), 2572 stream_response=False if self._emit_connector_builder_messages else True, 2573 ) 2574 2575 def create_jsonl_decoder( 2576 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2577 ) -> Decoder: 2578 return CompositeRawDecoder( 2579 parser=ModelToComponentFactory._get_parser(model, config), 2580 stream_response=False if self._emit_connector_builder_messages else True, 2581 ) 2582 2583 def create_gzip_decoder( 2584 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2585 ) -> Decoder: 2586 _compressed_response_types = { 2587 "gzip", 2588 "x-gzip", 2589 "gzip, deflate", 2590 "x-gzip, deflate", 2591 "application/zip", 2592 "application/gzip", 2593 "application/x-gzip", 2594 "application/x-zip-compressed", 2595 } 2596 2597 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2598 2599 if self._emit_connector_builder_messages: 2600 # This is very surprising but if the response is not streamed, 2601 # CompositeRawDecoder calls response.content and the requests library 
actually uncompress the data as opposed to response.raw, 2602 # which uses urllib3 directly and does not uncompress the data. 2603 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2604 2605 return CompositeRawDecoder.by_headers( 2606 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2607 stream_response=True, 2608 fallback_parser=gzip_parser.inner_parser, 2609 ) 2610 2611 @staticmethod 2612 def create_incrementing_count_cursor( 2613 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2614 ) -> DatetimeBasedCursor: 2615 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2616 # we still parse models into components. The issue is that there's no runtime implementation of a 2617 # IncrementingCountCursor. 2618 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2619 return DatetimeBasedCursor( 2620 cursor_field=model.cursor_field, 2621 datetime_format="%Y-%m-%d", 2622 start_datetime="2024-12-12", 2623 config=config, 2624 parameters={}, 2625 ) 2626 2627 @staticmethod 2628 def create_iterable_decoder( 2629 model: IterableDecoderModel, config: Config, **kwargs: Any 2630 ) -> IterableDecoder: 2631 return IterableDecoder(parameters={}) 2632 2633 @staticmethod 2634 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2635 return XmlDecoder(parameters={}) 2636 2637 def create_zipfile_decoder( 2638 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2639 ) -> ZipfileDecoder: 2640 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2641 2642 @staticmethod 2643 def _get_parser(model: BaseModel, config: Config) -> Parser: 2644 if isinstance(model, JsonDecoderModel): 2645 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2646 return JsonParser() 2647 elif isinstance(model, JsonlDecoderModel): 2648 return JsonLineParser() 2649 elif isinstance(model, CsvDecoderModel): 2650 return CsvParser(encoding=model.encoding, delimiter=model.delimiter) 2651 elif isinstance(model, GzipDecoderModel): 2652 return GzipParser( 2653 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2654 ) 2655 elif isinstance( 2656 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2657 ): 2658 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2659 2660 raise ValueError(f"Unknown decoder type {model}") 2661 2662 @staticmethod 2663 def create_json_file_schema_loader( 2664 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2665 ) -> JsonFileSchemaLoader: 2666 return JsonFileSchemaLoader( 2667 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2668 ) 2669 2670 @staticmethod 2671 def create_jwt_authenticator( 2672 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2673 ) -> JwtAuthenticator: 2674 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2675 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2676 return JwtAuthenticator( 2677 config=config, 2678 parameters=model.parameters or {}, 2679 algorithm=JwtAlgorithm(model.algorithm.value), 2680 secret_key=model.secret_key, 2681 base64_encode_secret_key=model.base64_encode_secret_key, 2682 token_duration=model.token_duration, 
2683 header_prefix=model.header_prefix, 2684 kid=jwt_headers.kid, 2685 typ=jwt_headers.typ, 2686 cty=jwt_headers.cty, 2687 iss=jwt_payload.iss, 2688 sub=jwt_payload.sub, 2689 aud=jwt_payload.aud, 2690 additional_jwt_headers=model.additional_jwt_headers, 2691 additional_jwt_payload=model.additional_jwt_payload, 2692 ) 2693 2694 def create_list_partition_router( 2695 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2696 ) -> ListPartitionRouter: 2697 request_option = ( 2698 self._create_component_from_model(model.request_option, config) 2699 if model.request_option 2700 else None 2701 ) 2702 return ListPartitionRouter( 2703 cursor_field=model.cursor_field, 2704 request_option=request_option, 2705 values=model.values, 2706 config=config, 2707 parameters=model.parameters or {}, 2708 ) 2709 2710 @staticmethod 2711 def create_min_max_datetime( 2712 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2713 ) -> MinMaxDatetime: 2714 return MinMaxDatetime( 2715 datetime=model.datetime, 2716 datetime_format=model.datetime_format or "", 2717 max_datetime=model.max_datetime or "", 2718 min_datetime=model.min_datetime or "", 2719 parameters=model.parameters or {}, 2720 ) 2721 2722 @staticmethod 2723 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2724 return NoAuth(parameters=model.parameters or {}) 2725 2726 @staticmethod 2727 def create_no_pagination( 2728 model: NoPaginationModel, config: Config, **kwargs: Any 2729 ) -> NoPagination: 2730 return NoPagination(parameters={}) 2731 2732 def create_oauth_authenticator( 2733 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2734 ) -> DeclarativeOauth2Authenticator: 2735 profile_assertion = ( 2736 self._create_component_from_model(model.profile_assertion, config=config) 2737 if model.profile_assertion 2738 else None 2739 ) 2740 2741 if model.refresh_token_updater: 2742 # ignore type error because fixing it would have a lot of dependencies, revisit later 2743 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2744 config, 2745 InterpolatedString.create( 2746 model.token_refresh_endpoint, # type: ignore 2747 parameters=model.parameters or {}, 2748 ).eval(config), 2749 access_token_name=InterpolatedString.create( 2750 model.access_token_name or "access_token", parameters=model.parameters or {} 2751 ).eval(config), 2752 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2753 expires_in_name=InterpolatedString.create( 2754 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2755 ).eval(config), 2756 client_id_name=InterpolatedString.create( 2757 model.client_id_name or "client_id", parameters=model.parameters or {} 2758 ).eval(config), 2759 client_id=InterpolatedString.create( 2760 model.client_id, parameters=model.parameters or {} 2761 ).eval(config) 2762 if model.client_id 2763 else model.client_id, 2764 client_secret_name=InterpolatedString.create( 2765 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2766 ).eval(config), 2767 client_secret=InterpolatedString.create( 2768 model.client_secret, parameters=model.parameters or {} 2769 ).eval(config) 2770 if model.client_secret 2771 else model.client_secret, 2772 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2773 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2774 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2775 
grant_type_name=InterpolatedString.create( 2776 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2777 ).eval(config), 2778 grant_type=InterpolatedString.create( 2779 model.grant_type or "refresh_token", parameters=model.parameters or {} 2780 ).eval(config), 2781 refresh_request_body=InterpolatedMapping( 2782 model.refresh_request_body or {}, parameters=model.parameters or {} 2783 ).eval(config), 2784 refresh_request_headers=InterpolatedMapping( 2785 model.refresh_request_headers or {}, parameters=model.parameters or {} 2786 ).eval(config), 2787 scopes=model.scopes, 2788 token_expiry_date_format=model.token_expiry_date_format, 2789 message_repository=self._message_repository, 2790 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2791 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2792 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2793 ) 2794 # ignore type error because fixing it would have a lot of dependencies, revisit later 2795 return DeclarativeOauth2Authenticator( # type: ignore 2796 access_token_name=model.access_token_name or "access_token", 2797 access_token_value=model.access_token_value, 2798 client_id_name=model.client_id_name or "client_id", 2799 client_id=model.client_id, 2800 client_secret_name=model.client_secret_name or "client_secret", 2801 client_secret=model.client_secret, 2802 expires_in_name=model.expires_in_name or "expires_in", 2803 grant_type_name=model.grant_type_name or "grant_type", 2804 grant_type=model.grant_type or "refresh_token", 2805 refresh_request_body=model.refresh_request_body, 2806 refresh_request_headers=model.refresh_request_headers, 2807 refresh_token_name=model.refresh_token_name or "refresh_token", 2808 refresh_token=model.refresh_token, 2809 scopes=model.scopes, 2810 token_expiry_date=model.token_expiry_date, 2811 token_expiry_date_format=model.token_expiry_date_format, 2812 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2813 token_refresh_endpoint=model.token_refresh_endpoint, 2814 config=config, 2815 parameters=model.parameters or {}, 2816 message_repository=self._message_repository, 2817 profile_assertion=profile_assertion, 2818 use_profile_assertion=model.use_profile_assertion, 2819 ) 2820 2821 def create_offset_increment( 2822 self, 2823 model: OffsetIncrementModel, 2824 config: Config, 2825 decoder: Decoder, 2826 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2827 **kwargs: Any, 2828 ) -> OffsetIncrement: 2829 if isinstance(decoder, PaginationDecoderDecorator): 2830 inner_decoder = decoder.decoder 2831 else: 2832 inner_decoder = decoder 2833 decoder = PaginationDecoderDecorator(decoder=decoder) 2834 2835 if self._is_supported_decoder_for_pagination(inner_decoder): 2836 decoder_to_use = decoder 2837 else: 2838 raise ValueError( 2839 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2840 ) 2841 2842 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2843 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2844 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2845 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 
2846 # When we have more time to investigate we can look into reusing the same component. 2847 extractor = ( 2848 self._create_component_from_model( 2849 model=extractor_model, config=config, decoder=decoder_to_use 2850 ) 2851 if extractor_model 2852 else None 2853 ) 2854 2855 return OffsetIncrement( 2856 page_size=model.page_size, 2857 config=config, 2858 decoder=decoder_to_use, 2859 extractor=extractor, 2860 inject_on_first_request=model.inject_on_first_request or False, 2861 parameters=model.parameters or {}, 2862 ) 2863 2864 @staticmethod 2865 def create_page_increment( 2866 model: PageIncrementModel, config: Config, **kwargs: Any 2867 ) -> PageIncrement: 2868 return PageIncrement( 2869 page_size=model.page_size, 2870 config=config, 2871 start_from_page=model.start_from_page or 0, 2872 inject_on_first_request=model.inject_on_first_request or False, 2873 parameters=model.parameters or {}, 2874 ) 2875 2876 def create_parent_stream_config( 2877 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2878 ) -> ParentStreamConfig: 2879 declarative_stream = self._create_component_from_model( 2880 model.stream, config=config, **kwargs 2881 ) 2882 request_option = ( 2883 self._create_component_from_model(model.request_option, config=config) 2884 if model.request_option 2885 else None 2886 ) 2887 2888 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2889 raise ValueError( 2890 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2891 ) 2892 2893 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2894 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2895 ) 2896 2897 return ParentStreamConfig( 2898 parent_key=model.parent_key, 2899 request_option=request_option, 2900 stream=declarative_stream, 2901 partition_field=model.partition_field, 2902 config=config, 2903 incremental_dependency=model.incremental_dependency or False, 2904 parameters=model.parameters or {}, 2905 extra_fields=model.extra_fields, 2906 lazy_read_pointer=model_lazy_read_pointer, 2907 ) 2908 2909 def create_properties_from_endpoint( 2910 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2911 ) -> PropertiesFromEndpoint: 2912 retriever = self._create_component_from_model( 2913 model=model.retriever, 2914 config=config, 2915 name="dynamic_properties", 2916 primary_key=None, 2917 stream_slicer=None, 2918 transformations=[], 2919 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2920 ) 2921 return PropertiesFromEndpoint( 2922 property_field_path=model.property_field_path, 2923 retriever=retriever, 2924 config=config, 2925 parameters=model.parameters or {}, 2926 ) 2927 2928 def create_property_chunking( 2929 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2930 ) -> PropertyChunking: 2931 record_merge_strategy = ( 2932 self._create_component_from_model( 2933 model=model.record_merge_strategy, config=config, **kwargs 2934 ) 2935 if model.record_merge_strategy 2936 else None 2937 ) 2938 2939 property_limit_type: PropertyLimitType 2940 match model.property_limit_type: 2941 case PropertyLimitTypeModel.property_count: 2942 property_limit_type = PropertyLimitType.property_count 2943 case PropertyLimitTypeModel.characters: 2944 property_limit_type = PropertyLimitType.characters 2945 case _: 2946 raise ValueError(f"Invalid 
PropertyLimitType {property_limit_type}") 2947 2948 return PropertyChunking( 2949 property_limit_type=property_limit_type, 2950 property_limit=model.property_limit, 2951 record_merge_strategy=record_merge_strategy, 2952 config=config, 2953 parameters=model.parameters or {}, 2954 ) 2955 2956 def create_query_properties( 2957 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2958 ) -> QueryProperties: 2959 if isinstance(model.property_list, list): 2960 property_list = model.property_list 2961 else: 2962 property_list = self._create_component_from_model( 2963 model=model.property_list, config=config, **kwargs 2964 ) 2965 2966 property_chunking = ( 2967 self._create_component_from_model( 2968 model=model.property_chunking, config=config, **kwargs 2969 ) 2970 if model.property_chunking 2971 else None 2972 ) 2973 2974 return QueryProperties( 2975 property_list=property_list, 2976 always_include_properties=model.always_include_properties, 2977 property_chunking=property_chunking, 2978 config=config, 2979 parameters=model.parameters or {}, 2980 ) 2981 2982 @staticmethod 2983 def create_record_filter( 2984 model: RecordFilterModel, config: Config, **kwargs: Any 2985 ) -> RecordFilter: 2986 return RecordFilter( 2987 condition=model.condition or "", config=config, parameters=model.parameters or {} 2988 ) 2989 2990 @staticmethod 2991 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 2992 return RequestPath(parameters={}) 2993 2994 @staticmethod 2995 def create_request_option( 2996 model: RequestOptionModel, config: Config, **kwargs: Any 2997 ) -> RequestOption: 2998 inject_into = RequestOptionType(model.inject_into.value) 2999 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3000 [ 3001 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3002 for segment in model.field_path 3003 ] 3004 if model.field_path 3005 else None 3006 ) 3007 field_name = ( 3008 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3009 if model.field_name 3010 else None 3011 ) 3012 return RequestOption( 3013 field_name=field_name, 3014 field_path=field_path, 3015 inject_into=inject_into, 3016 parameters=kwargs.get("parameters", {}), 3017 ) 3018 3019 def create_record_selector( 3020 self, 3021 model: RecordSelectorModel, 3022 config: Config, 3023 *, 3024 name: str, 3025 transformations: List[RecordTransformation] | None = None, 3026 decoder: Decoder | None = None, 3027 client_side_incremental_sync: Dict[str, Any] | None = None, 3028 file_uploader: Optional[DefaultFileUploader] = None, 3029 **kwargs: Any, 3030 ) -> RecordSelector: 3031 extractor = self._create_component_from_model( 3032 model=model.extractor, decoder=decoder, config=config 3033 ) 3034 record_filter = ( 3035 self._create_component_from_model(model.record_filter, config=config) 3036 if model.record_filter 3037 else None 3038 ) 3039 3040 transform_before_filtering = ( 3041 False if model.transform_before_filtering is None else model.transform_before_filtering 3042 ) 3043 if client_side_incremental_sync: 3044 record_filter = ClientSideIncrementalRecordFilterDecorator( 3045 config=config, 3046 parameters=model.parameters, 3047 condition=model.record_filter.condition 3048 if (model.record_filter and hasattr(model.record_filter, "condition")) 3049 else None, 3050 **client_side_incremental_sync, 3051 ) 3052 transform_before_filtering = ( 3053 True 3054 if model.transform_before_filtering is None 3055 else model.transform_before_filtering 3056 
) 3057 3058 if model.schema_normalization is None: 3059 # default to no schema normalization if not set 3060 model.schema_normalization = SchemaNormalizationModel.None_ 3061 3062 schema_normalization = ( 3063 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3064 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3065 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3066 ) 3067 3068 return RecordSelector( 3069 extractor=extractor, 3070 name=name, 3071 config=config, 3072 record_filter=record_filter, 3073 transformations=transformations or [], 3074 file_uploader=file_uploader, 3075 schema_normalization=schema_normalization, 3076 parameters=model.parameters or {}, 3077 transform_before_filtering=transform_before_filtering, 3078 ) 3079 3080 @staticmethod 3081 def create_remove_fields( 3082 model: RemoveFieldsModel, config: Config, **kwargs: Any 3083 ) -> RemoveFields: 3084 return RemoveFields( 3085 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3086 ) 3087 3088 def create_selective_authenticator( 3089 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3090 ) -> DeclarativeAuthenticator: 3091 authenticators = { 3092 name: self._create_component_from_model(model=auth, config=config) 3093 for name, auth in model.authenticators.items() 3094 } 3095 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3096 return SelectiveAuthenticator( # type: ignore[abstract] 3097 config=config, 3098 authenticators=authenticators, 3099 authenticator_selection_path=model.authenticator_selection_path, 3100 **kwargs, 3101 ) 3102 3103 @staticmethod 3104 def create_legacy_session_token_authenticator( 3105 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3106 ) -> LegacySessionTokenAuthenticator: 3107 return LegacySessionTokenAuthenticator( 3108 api_url=url_base, 3109 header=model.header, 3110 login_url=model.login_url, 3111 password=model.password or "", 3112 session_token=model.session_token or "", 3113 session_token_response_key=model.session_token_response_key or "", 3114 username=model.username or "", 3115 validate_session_url=model.validate_session_url, 3116 config=config, 3117 parameters=model.parameters or {}, 3118 ) 3119 3120 def create_simple_retriever( 3121 self, 3122 model: SimpleRetrieverModel, 3123 config: Config, 3124 *, 3125 name: str, 3126 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3127 stream_slicer: Optional[StreamSlicer], 3128 request_options_provider: Optional[RequestOptionsProvider] = None, 3129 stop_condition_on_cursor: bool = False, 3130 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3131 transformations: List[RecordTransformation], 3132 file_uploader: Optional[DefaultFileUploader] = None, 3133 incremental_sync: Optional[ 3134 Union[ 3135 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3136 ] 3137 ] = None, 3138 use_cache: Optional[bool] = None, 3139 log_formatter: Optional[Callable[[Response], Any]] = None, 3140 **kwargs: Any, 3141 ) -> SimpleRetriever: 3142 def _get_url() -> str: 3143 """ 3144 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3145 This is needed because the URL is not set until the requester is created. 
3146 """ 3147 3148 _url = ( 3149 model.requester.url 3150 if hasattr(model.requester, "url") and model.requester.url is not None 3151 else requester.get_url() 3152 ) 3153 _url_base = ( 3154 model.requester.url_base 3155 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3156 else requester.get_url_base() 3157 ) 3158 3159 return _url or _url_base 3160 3161 decoder = ( 3162 self._create_component_from_model(model=model.decoder, config=config) 3163 if model.decoder 3164 else JsonDecoder(parameters={}) 3165 ) 3166 record_selector = self._create_component_from_model( 3167 model=model.record_selector, 3168 name=name, 3169 config=config, 3170 decoder=decoder, 3171 transformations=transformations, 3172 client_side_incremental_sync=client_side_incremental_sync, 3173 file_uploader=file_uploader, 3174 ) 3175 3176 query_properties: Optional[QueryProperties] = None 3177 query_properties_key: Optional[str] = None 3178 if self._query_properties_in_request_parameters(model.requester): 3179 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3180 # places instead of default to request_parameters which isn't clearly documented 3181 if ( 3182 hasattr(model.requester, "fetch_properties_from_endpoint") 3183 and model.requester.fetch_properties_from_endpoint 3184 ): 3185 raise ValueError( 3186 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3187 ) 3188 3189 query_properties_definitions = [] 3190 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3191 if isinstance(request_parameter, QueryPropertiesModel): 3192 query_properties_key = key 3193 query_properties_definitions.append(request_parameter) 3194 3195 if len(query_properties_definitions) > 1: 3196 raise ValueError( 3197 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3198 ) 3199 3200 if len(query_properties_definitions) == 1: 3201 query_properties = self._create_component_from_model( 3202 model=query_properties_definitions[0], config=config 3203 ) 3204 elif ( 3205 hasattr(model.requester, "fetch_properties_from_endpoint") 3206 and model.requester.fetch_properties_from_endpoint 3207 ): 3208 query_properties_definition = QueryPropertiesModel( 3209 type="QueryProperties", 3210 property_list=model.requester.fetch_properties_from_endpoint, 3211 always_include_properties=None, 3212 property_chunking=None, 3213 ) # type: ignore # $parameters has a default value 3214 3215 query_properties = self.create_query_properties( 3216 model=query_properties_definition, 3217 config=config, 3218 ) 3219 3220 requester = self._create_component_from_model( 3221 model=model.requester, 3222 decoder=decoder, 3223 name=name, 3224 query_properties_key=query_properties_key, 3225 use_cache=use_cache, 3226 config=config, 3227 ) 3228 3229 # Define cursor only if per partition or common incremental support is needed 3230 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3231 3232 if ( 3233 not isinstance(stream_slicer, DatetimeBasedCursor) 3234 or type(stream_slicer) is not DatetimeBasedCursor 3235 ): 3236 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3237 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3238 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3239 # request_options_provider 3240 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3241 elif not request_options_provider: 3242 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3243 3244 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3245 3246 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3247 paginator = ( 3248 self._create_component_from_model( 3249 model=model.paginator, 3250 config=config, 3251 url_base=_get_url(), 3252 extractor_model=model.record_selector.extractor, 3253 decoder=decoder, 3254 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3255 ) 3256 if model.paginator 3257 else NoPagination(parameters={}) 3258 ) 3259 3260 ignore_stream_slicer_parameters_on_paginated_requests = ( 3261 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3262 ) 3263 3264 if ( 3265 model.partition_router 3266 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3267 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3268 and any( 3269 parent_stream_config.lazy_read_pointer 3270 for parent_stream_config in model.partition_router.parent_stream_configs 3271 ) 3272 ): 3273 if incremental_sync: 3274 if incremental_sync.type != "DatetimeBasedCursor": 3275 raise ValueError( 3276 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3277 ) 3278 3279 elif incremental_sync.step or incremental_sync.cursor_granularity: 3280 raise ValueError( 3281 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3282 ) 3283 3284 if model.decoder and model.decoder.type != "JsonDecoder": 3285 raise ValueError( 3286 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 
3287 ) 3288 3289 return LazySimpleRetriever( 3290 name=name, 3291 paginator=paginator, 3292 primary_key=primary_key, 3293 requester=requester, 3294 record_selector=record_selector, 3295 stream_slicer=stream_slicer, 3296 request_option_provider=request_options_provider, 3297 cursor=cursor, 3298 config=config, 3299 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3300 parameters=model.parameters or {}, 3301 ) 3302 3303 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3304 return SimpleRetrieverTestReadDecorator( 3305 name=name, 3306 paginator=paginator, 3307 primary_key=primary_key, 3308 requester=requester, 3309 record_selector=record_selector, 3310 stream_slicer=stream_slicer, 3311 request_option_provider=request_options_provider, 3312 cursor=cursor, 3313 config=config, 3314 maximum_number_of_slices=self._limit_slices_fetched or 5, 3315 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3316 log_formatter=log_formatter, 3317 parameters=model.parameters or {}, 3318 ) 3319 return SimpleRetriever( 3320 name=name, 3321 paginator=paginator, 3322 primary_key=primary_key, 3323 requester=requester, 3324 record_selector=record_selector, 3325 stream_slicer=stream_slicer, 3326 request_option_provider=request_options_provider, 3327 cursor=cursor, 3328 config=config, 3329 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3330 additional_query_properties=query_properties, 3331 parameters=model.parameters or {}, 3332 ) 3333 3334 @staticmethod 3335 def _query_properties_in_request_parameters( 3336 requester: Union[HttpRequesterModel, CustomRequesterModel], 3337 ) -> bool: 3338 if not hasattr(requester, "request_parameters"): 3339 return False 3340 request_parameters = requester.request_parameters 3341 if request_parameters and isinstance(request_parameters, Mapping): 3342 for request_parameter in request_parameters.values(): 3343 if isinstance(request_parameter, QueryPropertiesModel): 3344 return True 3345 return False 3346 3347 @staticmethod 3348 def _remove_query_properties( 3349 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3350 ) -> Mapping[str, str]: 3351 return { 3352 parameter_field: request_parameter 3353 for parameter_field, request_parameter in request_parameters.items() 3354 if not isinstance(request_parameter, QueryPropertiesModel) 3355 } 3356 3357 def create_state_delegating_stream( 3358 self, 3359 model: StateDelegatingStreamModel, 3360 config: Config, 3361 has_parent_state: Optional[bool] = None, 3362 **kwargs: Any, 3363 ) -> DeclarativeStream: 3364 if ( 3365 model.full_refresh_stream.name != model.name 3366 or model.name != model.incremental_stream.name 3367 ): 3368 raise ValueError( 3369 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3370 ) 3371 3372 stream_model = ( 3373 model.incremental_stream 3374 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3375 else model.full_refresh_stream 3376 ) 3377 3378 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3379 3380 def _create_async_job_status_mapping( 3381 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3382 ) -> Mapping[str, AsyncJobStatus]: 3383 api_status_to_cdk_status = {} 3384 for cdk_status, api_statuses in model.dict().items(): 3385 if cdk_status == "type": 3386 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3387 continue 3388 3389 for status in api_statuses: 3390 if status in api_status_to_cdk_status: 3391 raise ValueError( 3392 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3393 ) 3394 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3395 return api_status_to_cdk_status 3396 3397 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3398 match status: 3399 case "running": 3400 return AsyncJobStatus.RUNNING 3401 case "completed": 3402 return AsyncJobStatus.COMPLETED 3403 case "failed": 3404 return AsyncJobStatus.FAILED 3405 case "timeout": 3406 return AsyncJobStatus.TIMED_OUT 3407 case _: 3408 raise ValueError(f"Unsupported CDK status {status}") 3409 3410 def create_async_retriever( 3411 self, 3412 model: AsyncRetrieverModel, 3413 config: Config, 3414 *, 3415 name: str, 3416 primary_key: Optional[ 3417 Union[str, List[str], List[List[str]]] 3418 ], # this seems to be needed to match create_simple_retriever 3419 stream_slicer: Optional[StreamSlicer], 3420 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3421 transformations: List[RecordTransformation], 3422 **kwargs: Any, 3423 ) -> AsyncRetriever: 3424 def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever: 3425 record_selector = RecordSelector( 3426 extractor=download_extractor, 3427 name=name, 3428 record_filter=None, 3429 transformations=transformations, 3430 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3431 config=config, 3432 parameters={}, 3433 ) 3434 paginator = ( 3435 self._create_component_from_model( 3436 model=model.download_paginator, 3437 decoder=decoder, 3438 config=config, 3439 url_base="", 3440 ) 3441 if model.download_paginator 3442 else NoPagination(parameters={}) 3443 ) 3444 maximum_number_of_slices = self._limit_slices_fetched or 5 3445 3446 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3447 return SimpleRetrieverTestReadDecorator( 3448 requester=download_requester, 3449 record_selector=record_selector, 3450 primary_key=None, 3451 name=job_download_components_name, 3452 paginator=paginator, 3453 config=config, 3454 parameters={}, 3455 maximum_number_of_slices=maximum_number_of_slices, 3456 ) 3457 3458 return SimpleRetriever( 3459 requester=download_requester, 3460 record_selector=record_selector, 3461 primary_key=None, 3462 name=job_download_components_name, 3463 paginator=paginator, 3464 config=config, 3465 parameters={}, 3466 ) 3467 3468 def _get_job_timeout() -> datetime.timedelta: 3469 user_defined_timeout: Optional[int] = ( 3470 int( 3471 InterpolatedString.create( 3472 str(model.polling_job_timeout), 3473 parameters={}, 3474 ).eval(config) 3475 ) 3476 if 
model.polling_job_timeout 3477 else None 3478 ) 3479 3480 # check for user defined timeout during the test read or 15 minutes 3481 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3482 # default value for non-connector builder is 60 minutes. 3483 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3484 3485 return ( 3486 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3487 ) 3488 3489 decoder = ( 3490 self._create_component_from_model(model=model.decoder, config=config) 3491 if model.decoder 3492 else JsonDecoder(parameters={}) 3493 ) 3494 record_selector = self._create_component_from_model( 3495 model=model.record_selector, 3496 config=config, 3497 decoder=decoder, 3498 name=name, 3499 transformations=transformations, 3500 client_side_incremental_sync=client_side_incremental_sync, 3501 ) 3502 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3503 creation_requester = self._create_component_from_model( 3504 model=model.creation_requester, 3505 decoder=decoder, 3506 config=config, 3507 name=f"job creation - {name}", 3508 ) 3509 polling_requester = self._create_component_from_model( 3510 model=model.polling_requester, 3511 decoder=decoder, 3512 config=config, 3513 name=f"job polling - {name}", 3514 ) 3515 job_download_components_name = f"job download - {name}" 3516 download_decoder = ( 3517 self._create_component_from_model(model=model.download_decoder, config=config) 3518 if model.download_decoder 3519 else JsonDecoder(parameters={}) 3520 ) 3521 download_extractor = ( 3522 self._create_component_from_model( 3523 model=model.download_extractor, 3524 config=config, 3525 decoder=download_decoder, 3526 parameters=model.parameters, 3527 ) 3528 if model.download_extractor 3529 else DpathExtractor( 3530 [], 3531 config=config, 3532 decoder=download_decoder, 3533 parameters=model.parameters or {}, 3534 ) 3535 ) 3536 download_requester = self._create_component_from_model( 3537 model=model.download_requester, 3538 decoder=download_decoder, 3539 config=config, 3540 name=job_download_components_name, 3541 ) 3542 download_retriever = _get_download_retriever() 3543 abort_requester = ( 3544 self._create_component_from_model( 3545 model=model.abort_requester, 3546 decoder=decoder, 3547 config=config, 3548 name=f"job abort - {name}", 3549 ) 3550 if model.abort_requester 3551 else None 3552 ) 3553 delete_requester = ( 3554 self._create_component_from_model( 3555 model=model.delete_requester, 3556 decoder=decoder, 3557 config=config, 3558 name=f"job delete - {name}", 3559 ) 3560 if model.delete_requester 3561 else None 3562 ) 3563 download_target_requester = ( 3564 self._create_component_from_model( 3565 model=model.download_target_requester, 3566 decoder=decoder, 3567 config=config, 3568 name=f"job extract_url - {name}", 3569 ) 3570 if model.download_target_requester 3571 else None 3572 ) 3573 status_extractor = self._create_component_from_model( 3574 model=model.status_extractor, decoder=decoder, config=config, name=name 3575 ) 3576 download_target_extractor = self._create_component_from_model( 3577 model=model.download_target_extractor, 3578 decoder=decoder, 3579 config=config, 3580 name=name, 3581 ) 3582 3583 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3584 creation_requester=creation_requester, 3585 polling_requester=polling_requester, 3586 download_retriever=download_retriever, 3587 download_target_requester=download_target_requester, 3588 abort_requester=abort_requester, 3589 
delete_requester=delete_requester, 3590 status_extractor=status_extractor, 3591 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3592 download_target_extractor=download_target_extractor, 3593 job_timeout=_get_job_timeout(), 3594 ) 3595 3596 async_job_partition_router = AsyncJobPartitionRouter( 3597 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3598 job_repository, 3599 stream_slices, 3600 self._job_tracker, 3601 self._message_repository, 3602 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3603 has_bulk_parent=False, 3604 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3605 # `None` == default retry is set to 3 attempts, under the hood. 3606 job_max_retry=1 if self._emit_connector_builder_messages else None, 3607 ), 3608 stream_slicer=stream_slicer, 3609 config=config, 3610 parameters=model.parameters or {}, 3611 ) 3612 3613 return AsyncRetriever( 3614 record_selector=record_selector, 3615 stream_slicer=async_job_partition_router, 3616 config=config, 3617 parameters=model.parameters or {}, 3618 ) 3619 3620 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3621 config_migrations = [ 3622 self._create_component_from_model(migration, config) 3623 for migration in ( 3624 model.config_normalization_rules.config_migrations 3625 if ( 3626 model.config_normalization_rules 3627 and model.config_normalization_rules.config_migrations 3628 ) 3629 else [] 3630 ) 3631 ] 3632 config_transformations = [ 3633 self._create_component_from_model(transformation, config) 3634 for transformation in ( 3635 model.config_normalization_rules.transformations 3636 if ( 3637 model.config_normalization_rules 3638 and model.config_normalization_rules.transformations 3639 ) 3640 else [] 3641 ) 3642 ] 3643 config_validations = [ 3644 self._create_component_from_model(validation, config) 3645 for validation in ( 3646 model.config_normalization_rules.validations 3647 if ( 3648 model.config_normalization_rules 3649 and model.config_normalization_rules.validations 3650 ) 3651 else [] 3652 ) 3653 ] 3654 3655 return Spec( 3656 connection_specification=model.connection_specification, 3657 documentation_url=model.documentation_url, 3658 advanced_auth=model.advanced_auth, 3659 parameters={}, 3660 config_migrations=config_migrations, 3661 config_transformations=config_transformations, 3662 config_validations=config_validations, 3663 ) 3664 3665 def create_substream_partition_router( 3666 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3667 ) -> SubstreamPartitionRouter: 3668 parent_stream_configs = [] 3669 if model.parent_stream_configs: 3670 parent_stream_configs.extend( 3671 [ 3672 self._create_message_repository_substream_wrapper( 3673 model=parent_stream_config, config=config, **kwargs 3674 ) 3675 for parent_stream_config in model.parent_stream_configs 3676 ] 3677 ) 3678 3679 return SubstreamPartitionRouter( 3680 parent_stream_configs=parent_stream_configs, 3681 parameters=model.parameters or {}, 3682 config=config, 3683 ) 3684 3685 def _create_message_repository_substream_wrapper( 3686 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3687 ) -> Any: 3688 substream_factory = ModelToComponentFactory( 3689 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3690 limit_slices_fetched=self._limit_slices_fetched, 3691 emit_connector_builder_messages=self._emit_connector_builder_messages, 3692 
disable_retries=self._disable_retries, 3693 disable_cache=self._disable_cache, 3694 message_repository=LogAppenderMessageRepositoryDecorator( 3695 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3696 self._message_repository, 3697 self._evaluate_log_level(self._emit_connector_builder_messages), 3698 ), 3699 ) 3700 3701 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3702 has_parent_state = bool( 3703 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3704 if model.incremental_dependency 3705 else False 3706 ) 3707 return substream_factory._create_component_from_model( 3708 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3709 ) 3710 3711 @staticmethod 3712 def create_wait_time_from_header( 3713 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3714 ) -> WaitTimeFromHeaderBackoffStrategy: 3715 return WaitTimeFromHeaderBackoffStrategy( 3716 header=model.header, 3717 parameters=model.parameters or {}, 3718 config=config, 3719 regex=model.regex, 3720 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3721 if model.max_waiting_time_in_seconds is not None 3722 else None, 3723 ) 3724 3725 @staticmethod 3726 def create_wait_until_time_from_header( 3727 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3728 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3729 return WaitUntilTimeFromHeaderBackoffStrategy( 3730 header=model.header, 3731 parameters=model.parameters or {}, 3732 config=config, 3733 min_wait=model.min_wait, 3734 regex=model.regex, 3735 ) 3736 3737 def get_message_repository(self) -> MessageRepository: 3738 return self._message_repository 3739 3740 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3741 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3742 3743 @staticmethod 3744 def create_components_mapping_definition( 3745 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3746 ) -> ComponentMappingDefinition: 3747 interpolated_value = InterpolatedString.create( 3748 model.value, parameters=model.parameters or {} 3749 ) 3750 field_path = [ 3751 InterpolatedString.create(path, parameters=model.parameters or {}) 3752 for path in model.field_path 3753 ] 3754 return ComponentMappingDefinition( 3755 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3756 value=interpolated_value, 3757 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3758 create_or_update=model.create_or_update, 3759 parameters=model.parameters or {}, 3760 ) 3761 3762 def create_http_components_resolver( 3763 self, model: HttpComponentsResolverModel, config: Config 3764 ) -> Any: 3765 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3766 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3767 3768 retriever = self._create_component_from_model( 3769 model=model.retriever, 3770 config=config, 3771 name="", 3772 primary_key=None, 3773 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3774 transformations=[], 3775 ) 3776 3777 components_mapping = [ 3778 self._create_component_from_model( 3779 model=components_mapping_definition_model, 3780 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3781 components_mapping_definition_model.value_type 3782 ), 3783 config=config, 3784 ) 3785 for 
components_mapping_definition_model in model.components_mapping 3786 ] 3787 3788 return HttpComponentsResolver( 3789 retriever=retriever, 3790 config=config, 3791 components_mapping=components_mapping, 3792 parameters=model.parameters or {}, 3793 ) 3794 3795 @staticmethod 3796 def create_stream_config( 3797 model: StreamConfigModel, config: Config, **kwargs: Any 3798 ) -> StreamConfig: 3799 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3800 [x for x in model.configs_pointer] if model.configs_pointer else [] 3801 ) 3802 3803 return StreamConfig( 3804 configs_pointer=model_configs_pointer, 3805 default_values=model.default_values, 3806 parameters=model.parameters or {}, 3807 ) 3808 3809 def create_config_components_resolver( 3810 self, model: ConfigComponentsResolverModel, config: Config 3811 ) -> Any: 3812 model_stream_configs = ( 3813 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3814 ) 3815 3816 stream_configs = [ 3817 self._create_component_from_model( 3818 stream_config, config=config, parameters=model.parameters or {} 3819 ) 3820 for stream_config in model_stream_configs 3821 ] 3822 3823 components_mapping = [ 3824 self._create_component_from_model( 3825 model=components_mapping_definition_model, 3826 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3827 components_mapping_definition_model.value_type 3828 ), 3829 config=config, 3830 ) 3831 for components_mapping_definition_model in model.components_mapping 3832 ] 3833 3834 return ConfigComponentsResolver( 3835 stream_configs=stream_configs, 3836 config=config, 3837 components_mapping=components_mapping, 3838 parameters=model.parameters or {}, 3839 ) 3840 3841 _UNSUPPORTED_DECODER_ERROR = ( 3842 "Specified decoder of {decoder_type} is not supported for pagination." 3843 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3844 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3845 ) 3846 3847 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3848 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3849 return True 3850 elif isinstance(decoder, CompositeRawDecoder): 3851 return self._is_supported_parser_for_pagination(decoder.parser) 3852 else: 3853 return False 3854 3855 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3856 if isinstance(parser, JsonParser): 3857 return True 3858 elif isinstance(parser, GzipParser): 3859 return isinstance(parser.inner_parser, JsonParser) 3860 else: 3861 return False 3862 3863 def create_http_api_budget( 3864 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3865 ) -> HttpAPIBudget: 3866 policies = [ 3867 self._create_component_from_model(model=policy, config=config) 3868 for policy in model.policies 3869 ] 3870 3871 return HttpAPIBudget( 3872 policies=policies, 3873 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3874 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3875 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3876 ) 3877 3878 def create_fixed_window_call_rate_policy( 3879 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3880 ) -> FixedWindowCallRatePolicy: 3881 matchers = [ 3882 self._create_component_from_model(model=matcher, config=config) 3883 for matcher in model.matchers 3884 ] 3885 3886 # Set the initial reset timestamp to 10 days from now. 3887 # This value will be updated by the first request. 3888 return FixedWindowCallRatePolicy( 3889 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3890 period=parse_duration(model.period), 3891 call_limit=model.call_limit, 3892 matchers=matchers, 3893 ) 3894 3895 def create_file_uploader( 3896 self, model: FileUploaderModel, config: Config, **kwargs: Any 3897 ) -> FileUploader: 3898 name = "File Uploader" 3899 requester = self._create_component_from_model( 3900 model=model.requester, 3901 config=config, 3902 name=name, 3903 **kwargs, 3904 ) 3905 download_target_extractor = self._create_component_from_model( 3906 model=model.download_target_extractor, 3907 config=config, 3908 name=name, 3909 **kwargs, 3910 ) 3911 emit_connector_builder_messages = self._emit_connector_builder_messages 3912 file_uploader = DefaultFileUploader( 3913 requester=requester, 3914 download_target_extractor=download_target_extractor, 3915 config=config, 3916 file_writer=NoopFileWriter() 3917 if emit_connector_builder_messages 3918 else LocalFileSystemFileWriter(), 3919 parameters=model.parameters or {}, 3920 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3921 ) 3922 3923 return ( 3924 ConnectorBuilderFileUploader(file_uploader) 3925 if emit_connector_builder_messages 3926 else file_uploader 3927 ) 3928 3929 def create_moving_window_call_rate_policy( 3930 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3931 ) -> MovingWindowCallRatePolicy: 3932 rates = [ 3933 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3934 ] 3935 matchers = [ 3936 self._create_component_from_model(model=matcher, config=config) 3937 for matcher in model.matchers 3938 ] 3939 return MovingWindowCallRatePolicy( 3940 rates=rates, 3941 matchers=matchers, 3942 ) 3943 3944 def create_unlimited_call_rate_policy( 3945 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 3946 ) -> UnlimitedCallRatePolicy: 3947 matchers = [ 3948 
self._create_component_from_model(model=matcher, config=config) 3949 for matcher in model.matchers 3950 ] 3951 3952 return UnlimitedCallRatePolicy( 3953 matchers=matchers, 3954 ) 3955 3956 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 3957 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 3958 return Rate( 3959 limit=int(interpolated_limit.eval(config=config)), 3960 interval=parse_duration(model.interval), 3961 ) 3962 3963 def create_http_request_matcher( 3964 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 3965 ) -> HttpRequestRegexMatcher: 3966 return HttpRequestRegexMatcher( 3967 method=model.method, 3968 url_base=model.url_base, 3969 url_path_pattern=model.url_path_pattern, 3970 params=model.params, 3971 headers=model.headers, 3972 ) 3973 3974 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 3975 self._api_budget = self.create_component( 3976 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 3977 ) 3978 3979 def create_grouping_partition_router( 3980 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 3981 ) -> GroupingPartitionRouter: 3982 underlying_router = self._create_component_from_model( 3983 model=model.underlying_partition_router, config=config 3984 ) 3985 if model.group_size < 1: 3986 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 3987 3988 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 3989 # because they are specific to individual partitions and cannot be aggregated or handled 3990 # when grouping, potentially leading to incorrect API calls. Any request customization 3991 # should be managed at the stream level through the requester's configuration. 3992 if isinstance(underlying_router, SubstreamPartitionRouter): 3993 if any( 3994 parent_config.request_option 3995 for parent_config in underlying_router.parent_stream_configs 3996 ): 3997 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3998 3999 if isinstance(underlying_router, ListPartitionRouter): 4000 if underlying_router.request_option: 4001 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4002 4003 return GroupingPartitionRouter( 4004 group_size=model.group_size, 4005 underlying_partition_router=underlying_router, 4006 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4007 config=config, 4008 )
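The call-rate factories above (create_http_api_budget, the *_call_rate_policy helpers, create_rate and create_http_request_matcher) are typically driven by a manifest-level HTTPAPIBudget definition handed to set_api_budget. A minimal, hedged sketch of such a call follows; the field values are illustrative and the authoritative schema lives in declarative_component_schema, not here.

factory = ModelToComponentFactory()
factory.set_api_budget(
    component_definition={
        "type": "HTTPAPIBudget",
        "policies": [
            {
                "type": "MovingWindowCallRatePolicy",
                "rates": [{"type": "Rate", "limit": 10, "interval": "PT1M"}],
                "matchers": [
                    {"type": "HttpRequestRegexMatcher", "url_path_pattern": "/v1/items"}
                ],
            }
        ],
    },
    config={},
)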
618 def __init__( 619 self, 620 limit_pages_fetched_per_slice: Optional[int] = None, 621 limit_slices_fetched: Optional[int] = None, 622 emit_connector_builder_messages: bool = False, 623 disable_retries: bool = False, 624 disable_cache: bool = False, 625 disable_resumable_full_refresh: bool = False, 626 message_repository: Optional[MessageRepository] = None, 627 connector_state_manager: Optional[ConnectorStateManager] = None, 628 max_concurrent_async_job_count: Optional[int] = None, 629 ): 630 self._init_mappings() 631 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 632 self._limit_slices_fetched = limit_slices_fetched 633 self._emit_connector_builder_messages = emit_connector_builder_messages 634 self._disable_retries = disable_retries 635 self._disable_cache = disable_cache 636 self._disable_resumable_full_refresh = disable_resumable_full_refresh 637 self._message_repository = message_repository or InMemoryMessageRepository( 638 self._evaluate_log_level(emit_connector_builder_messages) 639 ) 640 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 641 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 642 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 643 # placeholder for deprecation warnings 644 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
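A hedged sketch of constructing the factory for a connector-builder style test read, using only the constructor options listed above (the specific values are illustrative, not defaults):

factory = ModelToComponentFactory(
    limit_pages_fetched_per_slice=5,
    limit_slices_fetched=5,
    emit_connector_builder_messages=True,
    disable_cache=True,
)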
756 def create_component( 757 self, 758 model_type: Type[BaseModel], 759 component_definition: ComponentDefinition, 760 config: Config, 761 **kwargs: Any, 762 ) -> Any: 763 """ 764 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 765 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 766 declarative components from that model. 767 768 :param model_type: The type of declarative component that is being initialized 769 :param component_definition: The mapping that represents a declarative component 770 :param config: The connector config that is provided by the customer 771 :return: The declarative component to be used at runtime 772 """ 773 774 component_type = component_definition.get("type") 775 if component_definition.get("type") != model_type.__name__: 776 raise ValueError( 777 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 778 ) 779 780 declarative_component_model = model_type.parse_obj(component_definition) 781 782 if not isinstance(declarative_component_model, model_type): 783 raise ValueError( 784 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 785 ) 786 787 return self._create_component_from_model( 788 model=declarative_component_model, config=config, **kwargs 789 )
Takes a given Pydantic model type and a Mapping representing a component definition, and creates the declarative component and subcomponents that will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
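For orientation, a minimal sketch of resolving a manifest mapping into a runtime component through create_component. The stream name and config values below are illustrative assumptions, not taken from any real connector.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,
)

factory = ModelToComponentFactory()
config = {"api_key": "example"}  # assumed connector config
definition = {"type": "CheckStream", "stream_names": ["users"]}  # assumed stream name

# The "type" key must match the model class name, otherwise create_component raises a ValueError.
check_stream = factory.create_component(
    model_type=CheckStreamModel,
    component_definition=definition,
    config=config,
)

The later sketches in this section reuse this factory and config.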
806 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 807 """ 808 Returns the deprecation warnings that were collected during the creation of components. 809 """ 810 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
827 def create_config_migration( 828 self, model: ConfigMigrationModel, config: Config 829 ) -> ConfigMigration: 830 transformations: List[ConfigTransformation] = [ 831 self._create_component_from_model(transformation, config) 832 for transformation in model.transformations 833 ] 834 835 return ConfigMigration( 836 description=model.description, 837 transformations=transformations, 838 )
840 def create_config_add_fields( 841 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 842 ) -> ConfigAddFields: 843 fields = [self._create_component_from_model(field, config) for field in model.fields] 844 return ConfigAddFields( 845 fields=fields, 846 condition=model.condition or "", 847 )
896 @staticmethod 897 def create_added_field_definition( 898 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 899 ) -> AddedFieldDefinition: 900 interpolated_value = InterpolatedString.create( 901 model.value, parameters=model.parameters or {} 902 ) 903 return AddedFieldDefinition( 904 path=model.path, 905 value=interpolated_value, 906 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 907 parameters=model.parameters or {}, 908 )
910 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 911 added_field_definitions = [ 912 self._create_component_from_model( 913 model=added_field_definition_model, 914 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 915 added_field_definition_model.value_type 916 ), 917 config=config, 918 ) 919 for added_field_definition_model in model.fields 920 ] 921 return AddFields( 922 fields=added_field_definitions, 923 condition=model.condition or "", 924 parameters=model.parameters or {}, 925 )
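A rough sketch of an AddFields definition that create_component resolves through create_add_fields and create_added_field_definition; the field path and interpolated value are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddFields as AddFieldsModel,
)

add_fields_definition = {
    "type": "AddFields",
    "fields": [
        {
            "type": "AddedFieldDefinition",
            "path": ["shop_id"],                 # assumed target path in each record
            "value": "{{ config['shop_id'] }}",  # interpolated against the connector config at runtime
        }
    ],
}
add_fields = factory.create_component(AddFieldsModel, add_fields_definition, config)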
951 def create_dpath_flatten_fields( 952 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 953 ) -> DpathFlattenFields: 954 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 955 key_transformation = ( 956 KeyTransformation( 957 config=config, 958 prefix=model.key_transformation.prefix, 959 suffix=model.key_transformation.suffix, 960 parameters=model.parameters or {}, 961 ) 962 if model.key_transformation is not None 963 else None 964 ) 965 return DpathFlattenFields( 966 config=config, 967 field_path=model_field_path, 968 delete_origin_value=model.delete_origin_value 969 if model.delete_origin_value is not None 970 else False, 971 replace_record=model.replace_record if model.replace_record is not None else False, 972 key_transformation=key_transformation, 973 parameters=model.parameters or {}, 974 )
988 def create_api_key_authenticator( 989 self, 990 model: ApiKeyAuthenticatorModel, 991 config: Config, 992 token_provider: Optional[TokenProvider] = None, 993 **kwargs: Any, 994 ) -> ApiKeyAuthenticator: 995 if model.inject_into is None and model.header is None: 996 raise ValueError( 997 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 998 ) 999 1000 if model.inject_into is not None and model.header is not None: 1001 raise ValueError( 1002 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1003 ) 1004 1005 if token_provider is not None and model.api_token != "": 1006 raise ValueError( 1007 "If token_provider is set, api_token is ignored and has to be set to empty string." 1008 ) 1009 1010 request_option = ( 1011 self._create_component_from_model( 1012 model.inject_into, config, parameters=model.parameters or {} 1013 ) 1014 if model.inject_into 1015 else RequestOption( 1016 inject_into=RequestOptionType.header, 1017 field_name=model.header or "", 1018 parameters=model.parameters or {}, 1019 ) 1020 ) 1021 1022 return ApiKeyAuthenticator( 1023 token_provider=( 1024 token_provider 1025 if token_provider is not None 1026 else InterpolatedStringTokenProvider( 1027 api_token=model.api_token or "", 1028 config=config, 1029 parameters=model.parameters or {}, 1030 ) 1031 ), 1032 request_option=request_option, 1033 config=config, 1034 parameters=model.parameters or {}, 1035 )
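A sketch of the recommended injection style (inject_into rather than the deprecated header field); the header name and config key are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
)

api_key_definition = {
    "type": "ApiKeyAuthenticator",
    "api_token": "{{ config['api_key'] }}",
    "inject_into": {                # mutually exclusive with the deprecated "header" option
        "type": "RequestOption",
        "inject_into": "header",
        "field_name": "X-API-Key",  # assumed header name
    },
}
api_key_auth = factory.create_component(ApiKeyAuthenticatorModel, api_key_definition, config)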
1037 def create_legacy_to_per_partition_state_migration( 1038 self, 1039 model: LegacyToPerPartitionStateMigrationModel, 1040 config: Mapping[str, Any], 1041 declarative_stream: DeclarativeStreamModel, 1042 ) -> LegacyToPerPartitionStateMigration: 1043 retriever = declarative_stream.retriever 1044 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1045 raise ValueError( 1046 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1047 ) 1048 partition_router = retriever.partition_router 1049 if not isinstance( 1050 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1051 ): 1052 raise ValueError( 1053 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1054 ) 1055 if not hasattr(partition_router, "parent_stream_configs"): 1056 raise ValueError( 1057 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1058 ) 1059 1060 if not hasattr(declarative_stream, "incremental_sync"): 1061 raise ValueError( 1062 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1063 ) 1064 1065 return LegacyToPerPartitionStateMigration( 1066 partition_router, # type: ignore # was already checked above 1067 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1068 config, 1069 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1070 )
1072 def create_session_token_authenticator( 1073 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1074 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1075 decoder = ( 1076 self._create_component_from_model(model=model.decoder, config=config) 1077 if model.decoder 1078 else JsonDecoder(parameters={}) 1079 ) 1080 login_requester = self._create_component_from_model( 1081 model=model.login_requester, 1082 config=config, 1083 name=f"{name}_login_requester", 1084 decoder=decoder, 1085 ) 1086 token_provider = SessionTokenProvider( 1087 login_requester=login_requester, 1088 session_token_path=model.session_token_path, 1089 expiration_duration=parse_duration(model.expiration_duration) 1090 if model.expiration_duration 1091 else None, 1092 parameters=model.parameters or {}, 1093 message_repository=self._message_repository, 1094 decoder=decoder, 1095 ) 1096 if model.request_authentication.type == "Bearer": 1097 return ModelToComponentFactory.create_bearer_authenticator( 1098 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1099 config, 1100 token_provider=token_provider, 1101 ) 1102 else: 1103 return self.create_api_key_authenticator( 1104 ApiKeyAuthenticatorModel( 1105 type="ApiKeyAuthenticator", 1106 api_token="", 1107 inject_into=model.request_authentication.inject_into, 1108 ), # type: ignore # $parameters and headers default to None 1109 config=config, 1110 token_provider=token_provider, 1111 )
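A sketch of a SessionTokenAuthenticator that logs in once and re-sends the session token as a bearer header; the login endpoint, request body, and token path are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
)

session_auth_definition = {
    "type": "SessionTokenAuthenticator",
    "login_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",  # assumed host
        "path": "/login",                        # assumed login endpoint
        "http_method": "POST",
        "request_body_json": {
            "username": "{{ config['username'] }}",
            "password": "{{ config['password'] }}",
        },
    },
    "session_token_path": ["token"],  # assumed location of the token in the login response
    "expiration_duration": "PT1H",
    "request_authentication": {"type": "Bearer"},
}
# name is a required keyword forwarded to create_session_token_authenticator
session_auth = factory.create_component(
    SessionTokenAuthenticatorModel, session_auth_definition, config, name="orders"
)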
1113 @staticmethod 1114 def create_basic_http_authenticator( 1115 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1116 ) -> BasicHttpAuthenticator: 1117 return BasicHttpAuthenticator( 1118 password=model.password or "", 1119 username=model.username, 1120 config=config, 1121 parameters=model.parameters or {}, 1122 )
1124 @staticmethod 1125 def create_bearer_authenticator( 1126 model: BearerAuthenticatorModel, 1127 config: Config, 1128 token_provider: Optional[TokenProvider] = None, 1129 **kwargs: Any, 1130 ) -> BearerAuthenticator: 1131 if token_provider is not None and model.api_token != "": 1132 raise ValueError( 1133 "If token_provider is set, api_token is ignored and has to be set to empty string." 1134 ) 1135 return BearerAuthenticator( 1136 token_provider=( 1137 token_provider 1138 if token_provider is not None 1139 else InterpolatedStringTokenProvider( 1140 api_token=model.api_token or "", 1141 config=config, 1142 parameters=model.parameters or {}, 1143 ) 1144 ), 1145 config=config, 1146 parameters=model.parameters or {}, 1147 )
1149 @staticmethod 1150 def create_dynamic_stream_check_config( 1151 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1152 ) -> DynamicStreamCheckConfig: 1153 return DynamicStreamCheckConfig( 1154 dynamic_stream_name=model.dynamic_stream_name, 1155 stream_count=model.stream_count or 0, 1156 )
1158 def create_check_stream( 1159 self, model: CheckStreamModel, config: Config, **kwargs: Any 1160 ) -> CheckStream: 1161 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1162 raise ValueError( 1163 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1164 ) 1165 1166 dynamic_streams_check_configs = ( 1167 [ 1168 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1169 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1170 ] 1171 if model.dynamic_streams_check_configs 1172 else [] 1173 ) 1174 1175 return CheckStream( 1176 stream_names=model.stream_names or [], 1177 dynamic_streams_check_configs=dynamic_streams_check_configs, 1178 parameters={}, 1179 )
1181 @staticmethod 1182 def create_check_dynamic_stream( 1183 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1184 ) -> CheckDynamicStream: 1185 assert model.use_check_availability is not None # for mypy 1186 1187 use_check_availability = model.use_check_availability 1188 1189 return CheckDynamicStream( 1190 stream_count=model.stream_count, 1191 use_check_availability=use_check_availability, 1192 parameters={}, 1193 )
1195 def create_composite_error_handler( 1196 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1197 ) -> CompositeErrorHandler: 1198 error_handlers = [ 1199 self._create_component_from_model(model=error_handler_model, config=config) 1200 for error_handler_model in model.error_handlers 1201 ] 1202 return CompositeErrorHandler( 1203 error_handlers=error_handlers, parameters=model.parameters or {} 1204 )
1206 @staticmethod 1207 def create_concurrency_level( 1208 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1209 ) -> ConcurrencyLevel: 1210 return ConcurrencyLevel( 1211 default_concurrency=model.default_concurrency, 1212 max_concurrency=model.max_concurrency, 1213 config=config, 1214 parameters={}, 1215 )
1217 @staticmethod 1218 def apply_stream_state_migrations( 1219 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1220 ) -> MutableMapping[str, Any]: 1221 if stream_state_migrations: 1222 for state_migration in stream_state_migrations: 1223 if state_migration.should_migrate(stream_state): 1224 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1225 stream_state = dict(state_migration.migrate(stream_state)) 1226 return stream_state
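A minimal, hypothetical migration object illustrating the should_migrate/migrate protocol this helper expects; the key rename is purely illustrative.

class RenameCursorKeyMigration:
    # Hypothetical migration: moves a legacy "updated" key to "updated_at".
    def should_migrate(self, stream_state):
        return "updated" in stream_state

    def migrate(self, stream_state):
        # May return an immutable mapping; apply_stream_state_migrations copies it into a dict.
        return {"updated_at": stream_state["updated"]}

migrated = ModelToComponentFactory.apply_stream_state_migrations(
    [RenameCursorKeyMigration()], {"updated": "2024-01-01"}
)
# migrated == {"updated_at": "2024-01-01"}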
1228 def create_concurrent_cursor_from_datetime_based_cursor( 1229 self, 1230 model_type: Type[BaseModel], 1231 component_definition: ComponentDefinition, 1232 stream_name: str, 1233 stream_namespace: Optional[str], 1234 config: Config, 1235 message_repository: Optional[MessageRepository] = None, 1236 runtime_lookback_window: Optional[datetime.timedelta] = None, 1237 stream_state_migrations: Optional[List[Any]] = None, 1238 **kwargs: Any, 1239 ) -> ConcurrentCursor: 1240 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1241 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1242 # incoming state and connector_state_manager that is initialized when the component factory is created 1243 stream_state = ( 1244 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1245 if "stream_state" not in kwargs 1246 else kwargs["stream_state"] 1247 ) 1248 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1249 1250 component_type = component_definition.get("type") 1251 if component_definition.get("type") != model_type.__name__: 1252 raise ValueError( 1253 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1254 ) 1255 1256 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1257 1258 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1259 raise ValueError( 1260 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1261 ) 1262 1263 interpolated_cursor_field = InterpolatedString.create( 1264 datetime_based_cursor_model.cursor_field, 1265 parameters=datetime_based_cursor_model.parameters or {}, 1266 ) 1267 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1268 1269 interpolated_partition_field_start = InterpolatedString.create( 1270 datetime_based_cursor_model.partition_field_start or "start_time", 1271 parameters=datetime_based_cursor_model.parameters or {}, 1272 ) 1273 interpolated_partition_field_end = InterpolatedString.create( 1274 datetime_based_cursor_model.partition_field_end or "end_time", 1275 parameters=datetime_based_cursor_model.parameters or {}, 1276 ) 1277 1278 slice_boundary_fields = ( 1279 interpolated_partition_field_start.eval(config=config), 1280 interpolated_partition_field_end.eval(config=config), 1281 ) 1282 1283 datetime_format = datetime_based_cursor_model.datetime_format 1284 1285 cursor_granularity = ( 1286 parse_duration(datetime_based_cursor_model.cursor_granularity) 1287 if datetime_based_cursor_model.cursor_granularity 1288 else None 1289 ) 1290 1291 lookback_window = None 1292 interpolated_lookback_window = ( 1293 InterpolatedString.create( 1294 datetime_based_cursor_model.lookback_window, 1295 parameters=datetime_based_cursor_model.parameters or {}, 1296 ) 1297 if datetime_based_cursor_model.lookback_window 1298 else None 1299 ) 1300 if interpolated_lookback_window: 1301 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1302 if evaluated_lookback_window: 1303 lookback_window = parse_duration(evaluated_lookback_window) 1304 1305 connector_state_converter: DateTimeStreamStateConverter 1306 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1307 datetime_format=datetime_format, 1308 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1309 is_sequential_state=True, # 
ConcurrentPerPartitionCursor only works with sequential state 1310 cursor_granularity=cursor_granularity, 1311 ) 1312 1313 # Adjusts the stream state by applying the runtime lookback window. 1314 # This is used to ensure correct state handling in case of failed partitions. 1315 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1316 if runtime_lookback_window and stream_state_value: 1317 new_stream_state = ( 1318 connector_state_converter.parse_timestamp(stream_state_value) 1319 - runtime_lookback_window 1320 ) 1321 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1322 new_stream_state 1323 ) 1324 1325 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1326 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1327 start_date_runtime_value = self.create_min_max_datetime( 1328 model=datetime_based_cursor_model.start_datetime, config=config 1329 ) 1330 else: 1331 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1332 1333 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1334 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1335 end_date_runtime_value = self.create_min_max_datetime( 1336 model=datetime_based_cursor_model.end_datetime, config=config 1337 ) 1338 else: 1339 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1340 1341 interpolated_start_date = MinMaxDatetime.create( 1342 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1343 parameters=datetime_based_cursor_model.parameters, 1344 ) 1345 interpolated_end_date = ( 1346 None 1347 if not end_date_runtime_value 1348 else MinMaxDatetime.create( 1349 end_date_runtime_value, datetime_based_cursor_model.parameters 1350 ) 1351 ) 1352 1353 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1354 if not interpolated_start_date.datetime_format: 1355 interpolated_start_date.datetime_format = datetime_format 1356 if interpolated_end_date and not interpolated_end_date.datetime_format: 1357 interpolated_end_date.datetime_format = datetime_format 1358 1359 start_date = interpolated_start_date.get_datetime(config=config) 1360 end_date_provider = ( 1361 partial(interpolated_end_date.get_datetime, config) 1362 if interpolated_end_date 1363 else connector_state_converter.get_end_provider() 1364 ) 1365 1366 if ( 1367 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1368 ) or ( 1369 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1370 ): 1371 raise ValueError( 1372 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1373 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1374 ) 1375 1376 # When step is not defined, default to a step size from the starting date to the present moment 1377 step_length = datetime.timedelta.max 1378 interpolated_step = ( 1379 InterpolatedString.create( 1380 datetime_based_cursor_model.step, 1381 parameters=datetime_based_cursor_model.parameters or {}, 1382 ) 1383 if datetime_based_cursor_model.step 1384 else None 1385 ) 1386 if interpolated_step: 1387 evaluated_step = interpolated_step.eval(config) 1388 if evaluated_step: 1389 step_length = parse_duration(evaluated_step) 1390 1391 clamping_strategy: ClampingStrategy = NoClamping() 1392 if datetime_based_cursor_model.clamping: 1393 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1394 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1395 # object which we want to keep agnostic of being low-code 1396 target = InterpolatedString( 1397 string=datetime_based_cursor_model.clamping.target, 1398 parameters=datetime_based_cursor_model.parameters or {}, 1399 ) 1400 evaluated_target = target.eval(config=config) 1401 match evaluated_target: 1402 case "DAY": 1403 clamping_strategy = DayClampingStrategy() 1404 end_date_provider = ClampingEndProvider( 1405 DayClampingStrategy(is_ceiling=False), 1406 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1407 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1408 ) 1409 case "WEEK": 1410 if ( 1411 not datetime_based_cursor_model.clamping.target_details 1412 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1413 ): 1414 raise ValueError( 1415 "Given WEEK clamping, weekday needs to be provided as target_details" 1416 ) 1417 weekday = self._assemble_weekday( 1418 datetime_based_cursor_model.clamping.target_details["weekday"] 1419 ) 1420 clamping_strategy = WeekClampingStrategy(weekday) 1421 end_date_provider = ClampingEndProvider( 1422 WeekClampingStrategy(weekday, is_ceiling=False), 1423 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1424 granularity=cursor_granularity or datetime.timedelta(days=1), 1425 ) 1426 case "MONTH": 1427 clamping_strategy = MonthClampingStrategy() 1428 end_date_provider = ClampingEndProvider( 1429 MonthClampingStrategy(is_ceiling=False), 1430 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1431 granularity=cursor_granularity or datetime.timedelta(days=1), 1432 ) 1433 case _: 1434 raise ValueError( 1435 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1436 ) 1437 1438 return ConcurrentCursor( 1439 stream_name=stream_name, 1440 stream_namespace=stream_namespace, 1441 stream_state=stream_state, 1442 message_repository=message_repository or self._message_repository, 1443 connector_state_manager=self._connector_state_manager, 1444 connector_state_converter=connector_state_converter, 1445 cursor_field=cursor_field, 1446 slice_boundary_fields=slice_boundary_fields, 1447 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1448 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1449 lookback_window=lookback_window, 1450 slice_range=step_length, 1451 cursor_granularity=cursor_granularity, 1452 clamping_strategy=clamping_strategy, 1453 )
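A sketch of building a ConcurrentCursor from a DatetimeBasedCursor definition, reusing the factory and config from the earlier sketch. Note that step and cursor_granularity must be provided together, as enforced above; the cursor field, formats, and dates are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DatetimeBasedCursor as DatetimeBasedCursorModel,
)

cursor_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",             # assumed cursor field
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "2024-01-01T00:00:00Z",
    "step": "P1D",                            # slice size
    "cursor_granularity": "PT1S",             # required whenever step is set, and vice-versa
}
concurrent_cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
    model_type=DatetimeBasedCursorModel,
    component_definition=cursor_definition,
    stream_name="orders",                     # assumed stream
    stream_namespace=None,
    config=config,
)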
1455 def create_concurrent_cursor_from_incrementing_count_cursor( 1456 self, 1457 model_type: Type[BaseModel], 1458 component_definition: ComponentDefinition, 1459 stream_name: str, 1460 stream_namespace: Optional[str], 1461 config: Config, 1462 message_repository: Optional[MessageRepository] = None, 1463 **kwargs: Any, 1464 ) -> ConcurrentCursor: 1465 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1466 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1467 # incoming state and connector_state_manager that is initialized when the component factory is created 1468 stream_state = ( 1469 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1470 if "stream_state" not in kwargs 1471 else kwargs["stream_state"] 1472 ) 1473 1474 component_type = component_definition.get("type") 1475 if component_definition.get("type") != model_type.__name__: 1476 raise ValueError( 1477 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1478 ) 1479 1480 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1481 1482 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1483 raise ValueError( 1484 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1485 ) 1486 1487 interpolated_start_value = ( 1488 InterpolatedString.create( 1489 incrementing_count_cursor_model.start_value, # type: ignore 1490 parameters=incrementing_count_cursor_model.parameters or {}, 1491 ) 1492 if incrementing_count_cursor_model.start_value 1493 else 0 1494 ) 1495 1496 interpolated_cursor_field = InterpolatedString.create( 1497 incrementing_count_cursor_model.cursor_field, 1498 parameters=incrementing_count_cursor_model.parameters or {}, 1499 ) 1500 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1501 1502 connector_state_converter = IncrementingCountStreamStateConverter( 1503 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1504 ) 1505 1506 return ConcurrentCursor( 1507 stream_name=stream_name, 1508 stream_namespace=stream_namespace, 1509 stream_state=stream_state, 1510 message_repository=message_repository or self._message_repository, 1511 connector_state_manager=self._connector_state_manager, 1512 connector_state_converter=connector_state_converter, 1513 cursor_field=cursor_field, 1514 slice_boundary_fields=None, 1515 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1517 )
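The same pattern applies to an IncrementingCountCursor definition; the cursor field and stream name are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    IncrementingCountCursor as IncrementingCountCursorModel,
)

count_cursor_definition = {
    "type": "IncrementingCountCursor",
    "cursor_field": "id",  # assumed monotonically increasing field
    "start_value": 0,
}
count_cursor = factory.create_concurrent_cursor_from_incrementing_count_cursor(
    model_type=IncrementingCountCursorModel,
    component_definition=count_cursor_definition,
    stream_name="events",  # assumed stream
    stream_namespace=None,
    config=config,
)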
1538 def create_concurrent_cursor_from_perpartition_cursor( 1539 self, 1540 state_manager: ConnectorStateManager, 1541 model_type: Type[BaseModel], 1542 component_definition: ComponentDefinition, 1543 stream_name: str, 1544 stream_namespace: Optional[str], 1545 config: Config, 1546 stream_state: MutableMapping[str, Any], 1547 partition_router: PartitionRouter, 1548 stream_state_migrations: Optional[List[Any]] = None, 1549 **kwargs: Any, 1550 ) -> ConcurrentPerPartitionCursor: 1551 component_type = component_definition.get("type") 1552 if component_definition.get("type") != model_type.__name__: 1553 raise ValueError( 1554 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1555 ) 1556 1557 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1558 1559 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1560 raise ValueError( 1561 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1562 ) 1563 1564 interpolated_cursor_field = InterpolatedString.create( 1565 datetime_based_cursor_model.cursor_field, 1566 parameters=datetime_based_cursor_model.parameters or {}, 1567 ) 1568 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1569 1570 datetime_format = datetime_based_cursor_model.datetime_format 1571 1572 cursor_granularity = ( 1573 parse_duration(datetime_based_cursor_model.cursor_granularity) 1574 if datetime_based_cursor_model.cursor_granularity 1575 else None 1576 ) 1577 1578 connector_state_converter: DateTimeStreamStateConverter 1579 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1580 datetime_format=datetime_format, 1581 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1582 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1583 cursor_granularity=cursor_granularity, 1584 ) 1585 1586 # Create the cursor factory 1587 cursor_factory = ConcurrentCursorFactory( 1588 partial( 1589 self.create_concurrent_cursor_from_datetime_based_cursor, 1590 state_manager=state_manager, 1591 model_type=model_type, 1592 component_definition=component_definition, 1593 stream_name=stream_name, 1594 stream_namespace=stream_namespace, 1595 config=config, 1596 message_repository=NoopMessageRepository(), 1597 stream_state_migrations=stream_state_migrations, 1598 ) 1599 ) 1600 1601 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1602 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1603 use_global_cursor = isinstance( 1604 partition_router, GroupingPartitionRouter 1605 ) or component_definition.get("global_substream_cursor", False) 1606 1607 # Return the concurrent cursor and state converter 1608 return ConcurrentPerPartitionCursor( 1609 cursor_factory=cursor_factory, 1610 partition_router=partition_router, 1611 stream_name=stream_name, 1612 stream_namespace=stream_namespace, 1613 stream_state=stream_state, 1614 message_repository=self._message_repository, # type: ignore 1615 connector_state_manager=state_manager, 1616 connector_state_converter=connector_state_converter, 1617 cursor_field=cursor_field, 1618 use_global_cursor=use_global_cursor, 1619 )
1621 @staticmethod 1622 def create_constant_backoff_strategy( 1623 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1624 ) -> ConstantBackoffStrategy: 1625 return ConstantBackoffStrategy( 1626 backoff_time_in_seconds=model.backoff_time_in_seconds, 1627 config=config, 1628 parameters=model.parameters or {}, 1629 )
1631 def create_cursor_pagination( 1632 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1633 ) -> CursorPaginationStrategy: 1634 if isinstance(decoder, PaginationDecoderDecorator): 1635 inner_decoder = decoder.decoder 1636 else: 1637 inner_decoder = decoder 1638 decoder = PaginationDecoderDecorator(decoder=decoder) 1639 1640 if self._is_supported_decoder_for_pagination(inner_decoder): 1641 decoder_to_use = decoder 1642 else: 1643 raise ValueError( 1644 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1645 ) 1646 1647 return CursorPaginationStrategy( 1648 cursor_value=model.cursor_value, 1649 decoder=decoder_to_use, 1650 page_size=model.page_size, 1651 stop_condition=model.stop_condition, 1652 config=config, 1653 parameters=model.parameters or {}, 1654 )
1656 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1657 """ 1658 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1659 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1660 :param model: The Pydantic model of the custom component being created 1661 :param config: The custom defined connector config 1662 :return: The declarative component built from the Pydantic model to be used at runtime 1663 """ 1664 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1665 component_fields = get_type_hints(custom_component_class) 1666 model_args = model.dict() 1667 model_args["config"] = config 1668 1669 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1670 # we defer to these arguments over the component's definition 1671 for key, arg in kwargs.items(): 1672 model_args[key] = arg 1673 1674 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1675 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1676 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1677 for model_field, model_value in model_args.items(): 1678 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1679 if ( 1680 isinstance(model_value, dict) 1681 and "type" not in model_value 1682 and model_field in component_fields 1683 ): 1684 derived_type = self._derive_component_type_from_type_hints( 1685 component_fields.get(model_field) 1686 ) 1687 if derived_type: 1688 model_value["type"] = derived_type 1689 1690 if self._is_component(model_value): 1691 model_args[model_field] = self._create_nested_component( 1692 model, model_field, model_value, config 1693 ) 1694 elif isinstance(model_value, list): 1695 vals = [] 1696 for v in model_value: 1697 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1698 derived_type = self._derive_component_type_from_type_hints( 1699 component_fields.get(model_field) 1700 ) 1701 if derived_type: 1702 v["type"] = derived_type 1703 if self._is_component(v): 1704 vals.append(self._create_nested_component(model, model_field, v, config)) 1705 else: 1706 vals.append(v) 1707 model_args[model_field] = vals 1708 1709 kwargs = { 1710 class_field: model_args[class_field] 1711 for class_field in component_fields.keys() 1712 if class_field in model_args 1713 } 1714 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
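A hypothetical custom component definition; the class path and extra fields are assumptions, and the referenced class must be importable at runtime. Additional properties whose names match the custom class's constructor fields are forwarded to it, and nested mappings with a type (or an inferable type hint) are first built into components themselves.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomRecordExtractor as CustomRecordExtractorModel,
)

custom_definition = {
    "type": "CustomRecordExtractor",
    "class_name": "source_example.components.TruncatingExtractor",  # hypothetical class
    "field_path": ["data"],  # hypothetical constructor field on that class
    "max_records": 100,      # hypothetical constructor field on that class
}
extractor = factory.create_component(CustomRecordExtractorModel, custom_definition, config)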
1846 def create_datetime_based_cursor( 1847 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1848 ) -> DatetimeBasedCursor: 1849 start_datetime: Union[str, MinMaxDatetime] = ( 1850 model.start_datetime 1851 if isinstance(model.start_datetime, str) 1852 else self.create_min_max_datetime(model.start_datetime, config) 1853 ) 1854 end_datetime: Union[str, MinMaxDatetime, None] = None 1855 if model.is_data_feed and model.end_datetime: 1856 raise ValueError("Data feed does not support end_datetime") 1857 if model.is_data_feed and model.is_client_side_incremental: 1858 raise ValueError( 1859 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1860 ) 1861 if model.end_datetime: 1862 end_datetime = ( 1863 model.end_datetime 1864 if isinstance(model.end_datetime, str) 1865 else self.create_min_max_datetime(model.end_datetime, config) 1866 ) 1867 1868 end_time_option = ( 1869 self._create_component_from_model( 1870 model.end_time_option, config, parameters=model.parameters or {} 1871 ) 1872 if model.end_time_option 1873 else None 1874 ) 1875 start_time_option = ( 1876 self._create_component_from_model( 1877 model.start_time_option, config, parameters=model.parameters or {} 1878 ) 1879 if model.start_time_option 1880 else None 1881 ) 1882 1883 return DatetimeBasedCursor( 1884 cursor_field=model.cursor_field, 1885 cursor_datetime_formats=model.cursor_datetime_formats 1886 if model.cursor_datetime_formats 1887 else [], 1888 cursor_granularity=model.cursor_granularity, 1889 datetime_format=model.datetime_format, 1890 end_datetime=end_datetime, 1891 start_datetime=start_datetime, 1892 step=model.step, 1893 end_time_option=end_time_option, 1894 lookback_window=model.lookback_window, 1895 start_time_option=start_time_option, 1896 partition_field_end=model.partition_field_end, 1897 partition_field_start=model.partition_field_start, 1898 message_repository=self._message_repository, 1899 is_compare_strictly=model.is_compare_strictly, 1900 config=config, 1901 parameters=model.parameters or {}, 1902 )
1904 def create_declarative_stream( 1905 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1906 ) -> DeclarativeStream: 1907 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1908 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1909 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1910 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1911 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1912 1913 primary_key = model.primary_key.__root__ if model.primary_key else None 1914 stop_condition_on_cursor = ( 1915 model.incremental_sync 1916 and hasattr(model.incremental_sync, "is_data_feed") 1917 and model.incremental_sync.is_data_feed 1918 ) 1919 client_side_incremental_sync = None 1920 if ( 1921 model.incremental_sync 1922 and hasattr(model.incremental_sync, "is_client_side_incremental") 1923 and model.incremental_sync.is_client_side_incremental 1924 ): 1925 supported_slicers = ( 1926 DatetimeBasedCursor, 1927 GlobalSubstreamCursor, 1928 PerPartitionWithGlobalCursor, 1929 ) 1930 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1931 raise ValueError( 1932 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead" 1933 ) 1934 cursor = ( 1935 combined_slicers 1936 if isinstance( 1937 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1938 ) 1939 else self._create_component_from_model(model=model.incremental_sync, config=config) 1940 ) 1941 1942 client_side_incremental_sync = {"cursor": cursor} 1943 1944 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1945 cursor_model = model.incremental_sync 1946 1947 end_time_option = ( 1948 self._create_component_from_model( 1949 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1950 ) 1951 if cursor_model.end_time_option 1952 else None 1953 ) 1954 start_time_option = ( 1955 self._create_component_from_model( 1956 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1957 ) 1958 if cursor_model.start_time_option 1959 else None 1960 ) 1961 1962 request_options_provider = DatetimeBasedRequestOptionsProvider( 1963 start_time_option=start_time_option, 1964 end_time_option=end_time_option, 1965 partition_field_start=cursor_model.partition_field_end, 1966 partition_field_end=cursor_model.partition_field_end, 1967 config=config, 1968 parameters=model.parameters or {}, 1969 ) 1970 elif model.incremental_sync and isinstance( 1971 model.incremental_sync, IncrementingCountCursorModel 1972 ): 1973 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1974 1975 start_time_option = ( 1976 self._create_component_from_model( 1977 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1978 config, 1979 parameters=cursor_model.parameters or {}, 1980 ) 1981 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1982 else None 1983 ) 1984 1985 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1986 # the default DatetimeBasedRequestOptionsProvider() sets them to 
start_time/end_time 1987 partition_field_start = "start" 1988 1989 request_options_provider = DatetimeBasedRequestOptionsProvider( 1990 start_time_option=start_time_option, 1991 partition_field_start=partition_field_start, 1992 config=config, 1993 parameters=model.parameters or {}, 1994 ) 1995 else: 1996 request_options_provider = None 1997 1998 transformations = [] 1999 if model.transformations: 2000 for transformation_model in model.transformations: 2001 transformations.append( 2002 self._create_component_from_model(model=transformation_model, config=config) 2003 ) 2004 file_uploader = None 2005 if model.file_uploader: 2006 file_uploader = self._create_component_from_model( 2007 model=model.file_uploader, config=config 2008 ) 2009 2010 retriever = self._create_component_from_model( 2011 model=model.retriever, 2012 config=config, 2013 name=model.name, 2014 primary_key=primary_key, 2015 stream_slicer=combined_slicers, 2016 request_options_provider=request_options_provider, 2017 stop_condition_on_cursor=stop_condition_on_cursor, 2018 client_side_incremental_sync=client_side_incremental_sync, 2019 transformations=transformations, 2020 file_uploader=file_uploader, 2021 incremental_sync=model.incremental_sync, 2022 ) 2023 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2024 2025 if model.state_migrations: 2026 state_transformations = [ 2027 self._create_component_from_model(state_migration, config, declarative_stream=model) 2028 for state_migration in model.state_migrations 2029 ] 2030 else: 2031 state_transformations = [] 2032 2033 schema_loader: Union[ 2034 CompositeSchemaLoader, 2035 DefaultSchemaLoader, 2036 DynamicSchemaLoader, 2037 InlineSchemaLoader, 2038 JsonFileSchemaLoader, 2039 ] 2040 if model.schema_loader and isinstance(model.schema_loader, list): 2041 nested_schema_loaders = [ 2042 self._create_component_from_model(model=nested_schema_loader, config=config) 2043 for nested_schema_loader in model.schema_loader 2044 ] 2045 schema_loader = CompositeSchemaLoader( 2046 schema_loaders=nested_schema_loaders, parameters={} 2047 ) 2048 elif model.schema_loader: 2049 schema_loader = self._create_component_from_model( 2050 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2051 config=config, 2052 ) 2053 else: 2054 options = model.parameters or {} 2055 if "name" not in options: 2056 options["name"] = model.name 2057 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2058 2059 return DeclarativeStream( 2060 name=model.name or "", 2061 primary_key=primary_key, 2062 retriever=retriever, 2063 schema_loader=schema_loader, 2064 stream_cursor_field=cursor_field or "", 2065 state_migrations=state_transformations, 2066 config=config, 2067 parameters=model.parameters or {}, 2068 )
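A compact sketch of a full DeclarativeStream definition without incremental sync, reusing the factory and config from the earlier sketch; the host, paths, and schema are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DeclarativeStream as DeclarativeStreamModel,
)

stream_definition = {
    "type": "DeclarativeStream",
    "name": "orders",
    "primary_key": "id",
    "retriever": {
        "type": "SimpleRetriever",
        "requester": {
            "type": "HttpRequester",
            "url_base": "https://api.example.com/v1",  # assumed host
            "path": "/orders",                          # assumed endpoint
            "http_method": "GET",
        },
        "record_selector": {
            "type": "RecordSelector",
            "extractor": {"type": "DpathExtractor", "field_path": ["data"]},
        },
    },
    "schema_loader": {
        "type": "InlineSchemaLoader",
        "schema": {"type": "object", "properties": {}},  # assumed (empty) schema
    },
}
stream = factory.create_component(DeclarativeStreamModel, stream_definition, config)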
2237 def create_default_error_handler( 2238 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2239 ) -> DefaultErrorHandler: 2240 backoff_strategies = [] 2241 if model.backoff_strategies: 2242 for backoff_strategy_model in model.backoff_strategies: 2243 backoff_strategies.append( 2244 self._create_component_from_model(model=backoff_strategy_model, config=config) 2245 ) 2246 2247 response_filters = [] 2248 if model.response_filters: 2249 for response_filter_model in model.response_filters: 2250 response_filters.append( 2251 self._create_component_from_model(model=response_filter_model, config=config) 2252 ) 2253 response_filters.append( 2254 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2255 ) 2256 2257 return DefaultErrorHandler( 2258 backoff_strategies=backoff_strategies, 2259 max_retries=model.max_retries, 2260 response_filters=response_filters, 2261 config=config, 2262 parameters=model.parameters or {}, 2263 )
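A sketch of a DefaultErrorHandler definition combining a constant backoff with a retry filter; the retry codes and backoff duration are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DefaultErrorHandler as DefaultErrorHandlerModel,
)

error_handler_definition = {
    "type": "DefaultErrorHandler",
    "max_retries": 5,
    "backoff_strategies": [
        {"type": "ConstantBackoffStrategy", "backoff_time_in_seconds": 10}
    ],
    "response_filters": [
        {"type": "HttpResponseFilter", "action": "RETRY", "http_codes": [429]}
    ],
}
error_handler = factory.create_component(DefaultErrorHandlerModel, error_handler_definition, config)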
2265 def create_default_paginator( 2266 self, 2267 model: DefaultPaginatorModel, 2268 config: Config, 2269 *, 2270 url_base: str, 2271 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2272 decoder: Optional[Decoder] = None, 2273 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2274 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2275 if decoder: 2276 if self._is_supported_decoder_for_pagination(decoder): 2277 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2278 else: 2279 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2280 else: 2281 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2282 page_size_option = ( 2283 self._create_component_from_model(model=model.page_size_option, config=config) 2284 if model.page_size_option 2285 else None 2286 ) 2287 page_token_option = ( 2288 self._create_component_from_model(model=model.page_token_option, config=config) 2289 if model.page_token_option 2290 else None 2291 ) 2292 pagination_strategy = self._create_component_from_model( 2293 model=model.pagination_strategy, 2294 config=config, 2295 decoder=decoder_to_use, 2296 extractor_model=extractor_model, 2297 ) 2298 if cursor_used_for_stop_condition: 2299 pagination_strategy = StopConditionPaginationStrategyDecorator( 2300 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2301 ) 2302 paginator = DefaultPaginator( 2303 decoder=decoder_to_use, 2304 page_size_option=page_size_option, 2305 page_token_option=page_token_option, 2306 pagination_strategy=pagination_strategy, 2307 url_base=url_base, 2308 config=config, 2309 parameters=model.parameters or {}, 2310 ) 2311 if self._limit_pages_fetched_per_slice: 2312 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2313 return paginator
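A sketch of a DefaultPaginator definition using cursor pagination; the response field, parameter name, and base URL are assumptions. url_base is a required keyword argument that create_component forwards through its kwargs.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DefaultPaginator as DefaultPaginatorModel,
)

paginator_definition = {
    "type": "DefaultPaginator",
    "pagination_strategy": {
        "type": "CursorPagination",
        "cursor_value": "{{ response.next_page }}",  # assumed field in the response
    },
    "page_token_option": {
        "type": "RequestOption",
        "inject_into": "request_parameter",
        "field_name": "page_token",                  # assumed query parameter
    },
}
paginator = factory.create_component(
    DefaultPaginatorModel,
    paginator_definition,
    config,
    url_base="https://api.example.com",              # assumed base URL
)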
2315 def create_dpath_extractor( 2316 self, 2317 model: DpathExtractorModel, 2318 config: Config, 2319 decoder: Optional[Decoder] = None, 2320 **kwargs: Any, 2321 ) -> DpathExtractor: 2322 if decoder: 2323 decoder_to_use = decoder 2324 else: 2325 decoder_to_use = JsonDecoder(parameters={}) 2326 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2327 return DpathExtractor( 2328 decoder=decoder_to_use, 2329 field_path=model_field_path, 2330 config=config, 2331 parameters=model.parameters or {}, 2332 )
2353 def create_http_requester( 2354 self, 2355 model: HttpRequesterModel, 2356 config: Config, 2357 decoder: Decoder = JsonDecoder(parameters={}), 2358 query_properties_key: Optional[str] = None, 2359 use_cache: Optional[bool] = None, 2360 *, 2361 name: str, 2362 ) -> HttpRequester: 2363 authenticator = ( 2364 self._create_component_from_model( 2365 model=model.authenticator, 2366 config=config, 2367 url_base=model.url or model.url_base, 2368 name=name, 2369 decoder=decoder, 2370 ) 2371 if model.authenticator 2372 else None 2373 ) 2374 error_handler = ( 2375 self._create_component_from_model(model=model.error_handler, config=config) 2376 if model.error_handler 2377 else DefaultErrorHandler( 2378 backoff_strategies=[], 2379 response_filters=[], 2380 config=config, 2381 parameters=model.parameters or {}, 2382 ) 2383 ) 2384 2385 api_budget = self._api_budget 2386 2387 # Removes QueryProperties components from the interpolated mappings because it has been designed 2388 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2389 # instead of through jinja interpolation 2390 request_parameters: Optional[Union[str, Mapping[str, str]]] 2391 if isinstance(model.request_parameters, Mapping): 2392 request_parameters = self._remove_query_properties(model.request_parameters) 2393 else: 2394 request_parameters = model.request_parameters 2395 2396 request_options_provider = InterpolatedRequestOptionsProvider( 2397 request_body=model.request_body, 2398 request_body_data=model.request_body_data, 2399 request_body_json=model.request_body_json, 2400 request_headers=model.request_headers, 2401 request_parameters=request_parameters, 2402 query_properties_key=query_properties_key, 2403 config=config, 2404 parameters=model.parameters or {}, 2405 ) 2406 2407 assert model.use_cache is not None # for mypy 2408 assert model.http_method is not None # for mypy 2409 2410 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2411 2412 return HttpRequester( 2413 name=name, 2414 url=model.url, 2415 url_base=model.url_base, 2416 path=model.path, 2417 authenticator=authenticator, 2418 error_handler=error_handler, 2419 api_budget=api_budget, 2420 http_method=HttpMethod[model.http_method.value], 2421 request_options_provider=request_options_provider, 2422 config=config, 2423 disable_retries=self._disable_retries, 2424 parameters=model.parameters or {}, 2425 message_repository=self._message_repository, 2426 use_cache=should_use_cache, 2427 decoder=decoder, 2428 stream_response=decoder.is_stream_response() if decoder else False, 2429 )
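A sketch of an HttpRequester definition with a bearer authenticator; the host, path, and parameters are assumptions, and name is a required keyword.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpRequester as HttpRequesterModel,
)

requester_definition = {
    "type": "HttpRequester",
    "url_base": "https://api.example.com/v1",  # assumed host
    "path": "/orders",                          # assumed endpoint
    "http_method": "GET",
    "authenticator": {
        "type": "BearerAuthenticator",
        "api_token": "{{ config['api_token'] }}",
    },
    "request_parameters": {"limit": "100"},     # assumed query parameter
}
requester = factory.create_component(
    HttpRequesterModel, requester_definition, config, name="orders"
)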
2431 @staticmethod 2432 def create_http_response_filter( 2433 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2434 ) -> HttpResponseFilter: 2435 if model.action: 2436 action = ResponseAction(model.action.value) 2437 else: 2438 action = None 2439 2440 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2441 2442 http_codes = ( 2443 set(model.http_codes) if model.http_codes else set() 2444 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2445 2446 return HttpResponseFilter( 2447 action=action, 2448 failure_type=failure_type, 2449 error_message=model.error_message or "", 2450 error_message_contains=model.error_message_contains or "", 2451 http_codes=http_codes, 2452 predicate=model.predicate or "", 2453 config=config, 2454 parameters=model.parameters or {}, 2455 )
2463 def create_complex_field_type( 2464 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2465 ) -> ComplexFieldType: 2466 items = ( 2467 self._create_component_from_model(model=model.items, config=config) 2468 if isinstance(model.items, ComplexFieldTypeModel) 2469 else model.items 2470 ) 2471 2472 return ComplexFieldType(field_type=model.field_type, items=items)
2474 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2475 target_type = ( 2476 self._create_component_from_model(model=model.target_type, config=config) 2477 if isinstance(model.target_type, ComplexFieldTypeModel) 2478 else model.target_type 2479 ) 2480 2481 return TypesMap( 2482 target_type=target_type, 2483 current_type=model.current_type, 2484 condition=model.condition if model.condition is not None else "True", 2485 )
2487 def create_schema_type_identifier( 2488 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2489 ) -> SchemaTypeIdentifier: 2490 types_mapping = [] 2491 if model.types_mapping: 2492 types_mapping.extend( 2493 [ 2494 self._create_component_from_model(types_map, config=config) 2495 for types_map in model.types_mapping 2496 ] 2497 ) 2498 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2499 [x for x in model.schema_pointer] if model.schema_pointer else [] 2500 ) 2501 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2502 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2503 [x for x in model.type_pointer] if model.type_pointer else None 2504 ) 2505 2506 return SchemaTypeIdentifier( 2507 schema_pointer=model_schema_pointer, 2508 key_pointer=model_key_pointer, 2509 type_pointer=model_type_pointer, 2510 types_mapping=types_mapping, 2511 parameters=model.parameters or {}, 2512 )
2514 def create_dynamic_schema_loader( 2515 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2516 ) -> DynamicSchemaLoader: 2517 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2518 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2519 2520 schema_transformations = [] 2521 if model.schema_transformations: 2522 for transformation_model in model.schema_transformations: 2523 schema_transformations.append( 2524 self._create_component_from_model(model=transformation_model, config=config) 2525 ) 2526 name = "dynamic_properties" 2527 retriever = self._create_component_from_model( 2528 model=model.retriever, 2529 config=config, 2530 name=name, 2531 primary_key=None, 2532 stream_slicer=combined_slicers, 2533 transformations=[], 2534 use_cache=True, 2535 log_formatter=( 2536 lambda response: format_http_message( 2537 response, 2538 f"Schema loader '{name}' request", 2539 f"Request performed in order to extract schema.", 2540 name, 2541 is_auxiliary=True, 2542 ) 2543 ), 2544 ) 2545 schema_type_identifier = self._create_component_from_model( 2546 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2547 ) 2548 schema_filter = ( 2549 self._create_component_from_model( 2550 model.schema_filter, config=config, parameters=model.parameters or {} 2551 ) 2552 if model.schema_filter is not None 2553 else None 2554 ) 2555 2556 return DynamicSchemaLoader( 2557 retriever=retriever, 2558 config=config, 2559 schema_transformations=schema_transformations, 2560 schema_filter=schema_filter, 2561 schema_type_identifier=schema_type_identifier, 2562 parameters=model.parameters or {}, 2563 )
2583 def create_gzip_decoder( 2584 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2585 ) -> Decoder: 2586 _compressed_response_types = { 2587 "gzip", 2588 "x-gzip", 2589 "gzip, deflate", 2590 "x-gzip, deflate", 2591 "application/zip", 2592 "application/gzip", 2593 "application/x-gzip", 2594 "application/x-zip-compressed", 2595 } 2596 2597 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2598 2599 if self._emit_connector_builder_messages: 2600 # This is very surprising but if the response is not streamed, 2601 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2602 # which uses urllib3 directly and does not uncompress the data. 2603 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2604 2605 return CompositeRawDecoder.by_headers( 2606 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2607 stream_response=True, 2608 fallback_parser=gzip_parser.inner_parser, 2609 )
2611 @staticmethod 2612 def create_incrementing_count_cursor( 2613 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2614 ) -> DatetimeBasedCursor: 2615 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2616 # we still parse models into components. The issue is that there's no runtime implementation of a 2617 # IncrementingCountCursor. 2618 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2619 return DatetimeBasedCursor( 2620 cursor_field=model.cursor_field, 2621 datetime_format="%Y-%m-%d", 2622 start_datetime="2024-12-12", 2623 config=config, 2624 parameters={}, 2625 )
2670 @staticmethod 2671 def create_jwt_authenticator( 2672 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2673 ) -> JwtAuthenticator: 2674 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2675 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2676 return JwtAuthenticator( 2677 config=config, 2678 parameters=model.parameters or {}, 2679 algorithm=JwtAlgorithm(model.algorithm.value), 2680 secret_key=model.secret_key, 2681 base64_encode_secret_key=model.base64_encode_secret_key, 2682 token_duration=model.token_duration, 2683 header_prefix=model.header_prefix, 2684 kid=jwt_headers.kid, 2685 typ=jwt_headers.typ, 2686 cty=jwt_headers.cty, 2687 iss=jwt_payload.iss, 2688 sub=jwt_payload.sub, 2689 aud=jwt_payload.aud, 2690 additional_jwt_headers=model.additional_jwt_headers, 2691 additional_jwt_payload=model.additional_jwt_payload, 2692 )
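A minimal JwtAuthenticator sketch; the algorithm, token duration, and secret reference are assumptions.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JwtAuthenticator as JwtAuthenticatorModel,
)

jwt_definition = {
    "type": "JwtAuthenticator",
    "secret_key": "{{ config['secret_key'] }}",  # assumed config key
    "algorithm": "HS256",                         # assumed signing algorithm
    "token_duration": 1200,                       # seconds the signed token stays valid
}
jwt_auth = factory.create_component(JwtAuthenticatorModel, jwt_definition, config)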
2694 def create_list_partition_router( 2695 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2696 ) -> ListPartitionRouter: 2697 request_option = ( 2698 self._create_component_from_model(model.request_option, config) 2699 if model.request_option 2700 else None 2701 ) 2702 return ListPartitionRouter( 2703 cursor_field=model.cursor_field, 2704 request_option=request_option, 2705 values=model.values, 2706 config=config, 2707 parameters=model.parameters or {}, 2708 )
2710 @staticmethod 2711 def create_min_max_datetime( 2712 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2713 ) -> MinMaxDatetime: 2714 return MinMaxDatetime( 2715 datetime=model.datetime, 2716 datetime_format=model.datetime_format or "", 2717 max_datetime=model.max_datetime or "", 2718 min_datetime=model.min_datetime or "", 2719 parameters=model.parameters or {}, 2720 )
2732 def create_oauth_authenticator( 2733 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2734 ) -> DeclarativeOauth2Authenticator: 2735 profile_assertion = ( 2736 self._create_component_from_model(model.profile_assertion, config=config) 2737 if model.profile_assertion 2738 else None 2739 ) 2740 2741 if model.refresh_token_updater: 2742 # ignore type error because fixing it would have a lot of dependencies, revisit later 2743 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2744 config, 2745 InterpolatedString.create( 2746 model.token_refresh_endpoint, # type: ignore 2747 parameters=model.parameters or {}, 2748 ).eval(config), 2749 access_token_name=InterpolatedString.create( 2750 model.access_token_name or "access_token", parameters=model.parameters or {} 2751 ).eval(config), 2752 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2753 expires_in_name=InterpolatedString.create( 2754 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2755 ).eval(config), 2756 client_id_name=InterpolatedString.create( 2757 model.client_id_name or "client_id", parameters=model.parameters or {} 2758 ).eval(config), 2759 client_id=InterpolatedString.create( 2760 model.client_id, parameters=model.parameters or {} 2761 ).eval(config) 2762 if model.client_id 2763 else model.client_id, 2764 client_secret_name=InterpolatedString.create( 2765 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2766 ).eval(config), 2767 client_secret=InterpolatedString.create( 2768 model.client_secret, parameters=model.parameters or {} 2769 ).eval(config) 2770 if model.client_secret 2771 else model.client_secret, 2772 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2773 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2774 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2775 grant_type_name=InterpolatedString.create( 2776 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2777 ).eval(config), 2778 grant_type=InterpolatedString.create( 2779 model.grant_type or "refresh_token", parameters=model.parameters or {} 2780 ).eval(config), 2781 refresh_request_body=InterpolatedMapping( 2782 model.refresh_request_body or {}, parameters=model.parameters or {} 2783 ).eval(config), 2784 refresh_request_headers=InterpolatedMapping( 2785 model.refresh_request_headers or {}, parameters=model.parameters or {} 2786 ).eval(config), 2787 scopes=model.scopes, 2788 token_expiry_date_format=model.token_expiry_date_format, 2789 message_repository=self._message_repository, 2790 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2791 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2792 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2793 ) 2794 # ignore type error because fixing it would have a lot of dependencies, revisit later 2795 return DeclarativeOauth2Authenticator( # type: ignore 2796 access_token_name=model.access_token_name or "access_token", 2797 access_token_value=model.access_token_value, 2798 client_id_name=model.client_id_name or "client_id", 2799 client_id=model.client_id, 2800 client_secret_name=model.client_secret_name or "client_secret", 2801 client_secret=model.client_secret, 2802 expires_in_name=model.expires_in_name or "expires_in", 2803 grant_type_name=model.grant_type_name or "grant_type", 2804 
grant_type=model.grant_type or "refresh_token", 2805 refresh_request_body=model.refresh_request_body, 2806 refresh_request_headers=model.refresh_request_headers, 2807 refresh_token_name=model.refresh_token_name or "refresh_token", 2808 refresh_token=model.refresh_token, 2809 scopes=model.scopes, 2810 token_expiry_date=model.token_expiry_date, 2811 token_expiry_date_format=model.token_expiry_date_format, 2812 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2813 token_refresh_endpoint=model.token_refresh_endpoint, 2814 config=config, 2815 parameters=model.parameters or {}, 2816 message_repository=self._message_repository, 2817 profile_assertion=profile_assertion, 2818 use_profile_assertion=model.use_profile_assertion, 2819 )
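# Illustrative sketch (not part of this module): create_oauth_authenticator above builds one of two
# runtime authenticators depending on whether the manifest declares a refresh_token_updater. A
# simplified, hypothetical selector that mirrors that branch and its main behavioral difference:
def _describe_oauth_flavor_sketch(has_refresh_token_updater: bool) -> dict:
    if has_refresh_token_updater:
        return {
            "authenticator": "DeclarativeSingleUseRefreshTokenOauth2Authenticator",
            # Field names and defaults are interpolated eagerly against the config, and refreshed
            # tokens are written back to the connector config at the configured paths.
            "persists_tokens_to_config": True,
        }
    return {
        "authenticator": "DeclarativeOauth2Authenticator",
        # Model values are passed through as-is; interpolation happens inside the component at runtime.
        "persists_tokens_to_config": False,
    }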
def create_offset_increment(
    self,
    model: OffsetIncrementModel,
    config: Config,
    decoder: Decoder,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    **kwargs: Any,
) -> OffsetIncrement:
    if isinstance(decoder, PaginationDecoderDecorator):
        inner_decoder = decoder.decoder
    else:
        inner_decoder = decoder
        decoder = PaginationDecoderDecorator(decoder=decoder)

    if self._is_supported_decoder_for_pagination(inner_decoder):
        decoder_to_use = decoder
    else:
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Ideally we would instantiate the runtime extractor at the highest level (in this case the SimpleRetriever)
    # so that it can be shared by OffsetIncrement and RecordSelector. However, because we wrap the decoder in
    # various decorators here, but not in create_record_selector, it is simpler to retain existing behavior by
    # creating two separate extractors with identical behavior from the same extractor model.
    # When we have more time to investigate, we can look into reusing the same component.
    extractor = (
        self._create_component_from_model(
            model=extractor_model, config=config, decoder=decoder_to_use
        )
        if extractor_model
        else None
    )

    return OffsetIncrement(
        page_size=model.page_size,
        config=config,
        decoder=decoder_to_use,
        extractor=extractor,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
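# Illustrative sketch (not part of this module): the decoder handling above follows an
# "unwrap if already decorated, otherwise wrap" pattern, so pagination always receives the decorated
# decoder while the support check runs against the innermost decoder. A generic standalone version
# with a hypothetical wrapper type:
from typing import Tuple

class _WrapperSketch:
    def __init__(self, inner: object) -> None:
        self.inner = inner

def _normalize_decoder_sketch(decoder: object) -> Tuple[object, object]:
    """Return (inner_decoder, wrapped_decoder) whether or not `decoder` is already wrapped."""
    if isinstance(decoder, _WrapperSketch):
        return decoder.inner, decoder
    return decoder, _WrapperSketch(decoder)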
2864 @staticmethod 2865 def create_page_increment( 2866 model: PageIncrementModel, config: Config, **kwargs: Any 2867 ) -> PageIncrement: 2868 return PageIncrement( 2869 page_size=model.page_size, 2870 config=config, 2871 start_from_page=model.start_from_page or 0, 2872 inject_on_first_request=model.inject_on_first_request or False, 2873 parameters=model.parameters or {}, 2874 )
2876 def create_parent_stream_config( 2877 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2878 ) -> ParentStreamConfig: 2879 declarative_stream = self._create_component_from_model( 2880 model.stream, config=config, **kwargs 2881 ) 2882 request_option = ( 2883 self._create_component_from_model(model.request_option, config=config) 2884 if model.request_option 2885 else None 2886 ) 2887 2888 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2889 raise ValueError( 2890 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2891 ) 2892 2893 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2894 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2895 ) 2896 2897 return ParentStreamConfig( 2898 parent_key=model.parent_key, 2899 request_option=request_option, 2900 stream=declarative_stream, 2901 partition_field=model.partition_field, 2902 config=config, 2903 incremental_dependency=model.incremental_dependency or False, 2904 parameters=model.parameters or {}, 2905 extra_fields=model.extra_fields, 2906 lazy_read_pointer=model_lazy_read_pointer, 2907 )
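# Illustrative sketch (not part of this module): the lazy_read_pointer validation above rejects any
# path segment containing "*". A standalone version of that check over a hypothetical pointer list:
def _validate_lazy_read_pointer_sketch(pointer: list[str]) -> list[str]:
    if any("*" in segment for segment in pointer):
        raise ValueError(
            "The '*' wildcard in 'lazy_read_pointer' is not supported; only direct paths are allowed."
        )
    return list(pointer)

# _validate_lazy_read_pointer_sketch(["data", "items"]) -> ["data", "items"]
# _validate_lazy_read_pointer_sketch(["data", "*"])     -> raises ValueError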
def create_properties_from_endpoint(
    self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
) -> PropertiesFromEndpoint:
    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name="dynamic_properties",
        primary_key=None,
        stream_slicer=None,
        transformations=[],
        # Enable caching on the HttpRequester/HttpClient because the properties endpoint is called for
        # every slice being processed, and it is highly unlikely for the response to differ between calls.
        use_cache=True,
    )
    return PropertiesFromEndpoint(
        property_field_path=model.property_field_path,
        retriever=retriever,
        config=config,
        parameters=model.parameters or {},
    )
def create_property_chunking(
    self, model: PropertyChunkingModel, config: Config, **kwargs: Any
) -> PropertyChunking:
    record_merge_strategy = (
        self._create_component_from_model(
            model=model.record_merge_strategy, config=config, **kwargs
        )
        if model.record_merge_strategy
        else None
    )

    property_limit_type: PropertyLimitType
    match model.property_limit_type:
        case PropertyLimitTypeModel.property_count:
            property_limit_type = PropertyLimitType.property_count
        case PropertyLimitTypeModel.characters:
            property_limit_type = PropertyLimitType.characters
        case _:
            raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")

    return PropertyChunking(
        property_limit_type=property_limit_type,
        property_limit=model.property_limit,
        record_merge_strategy=record_merge_strategy,
        config=config,
        parameters=model.parameters or {},
    )
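# Illustrative sketch (not part of this module): PropertyChunking splits a long list of requested
# properties into batches that respect the configured limit. A simplified, hypothetical version of
# chunking by property_count only (the real component also supports a character-based limit and a
# record merge strategy):
from typing import Iterable, Iterator, List

def _chunk_properties_sketch(properties: Iterable[str], property_limit: int) -> Iterator[List[str]]:
    chunk: List[str] = []
    for prop in properties:
        chunk.append(prop)
        if len(chunk) == property_limit:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

# list(_chunk_properties_sketch(["id", "name", "email", "age"], 3)) -> [["id", "name", "email"], ["age"]]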
2956 def create_query_properties( 2957 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2958 ) -> QueryProperties: 2959 if isinstance(model.property_list, list): 2960 property_list = model.property_list 2961 else: 2962 property_list = self._create_component_from_model( 2963 model=model.property_list, config=config, **kwargs 2964 ) 2965 2966 property_chunking = ( 2967 self._create_component_from_model( 2968 model=model.property_chunking, config=config, **kwargs 2969 ) 2970 if model.property_chunking 2971 else None 2972 ) 2973 2974 return QueryProperties( 2975 property_list=property_list, 2976 always_include_properties=model.always_include_properties, 2977 property_chunking=property_chunking, 2978 config=config, 2979 parameters=model.parameters or {}, 2980 )
2994 @staticmethod 2995 def create_request_option( 2996 model: RequestOptionModel, config: Config, **kwargs: Any 2997 ) -> RequestOption: 2998 inject_into = RequestOptionType(model.inject_into.value) 2999 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3000 [ 3001 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3002 for segment in model.field_path 3003 ] 3004 if model.field_path 3005 else None 3006 ) 3007 field_name = ( 3008 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3009 if model.field_name 3010 else None 3011 ) 3012 return RequestOption( 3013 field_name=field_name, 3014 field_path=field_path, 3015 inject_into=inject_into, 3016 parameters=kwargs.get("parameters", {}), 3017 )
3019 def create_record_selector( 3020 self, 3021 model: RecordSelectorModel, 3022 config: Config, 3023 *, 3024 name: str, 3025 transformations: List[RecordTransformation] | None = None, 3026 decoder: Decoder | None = None, 3027 client_side_incremental_sync: Dict[str, Any] | None = None, 3028 file_uploader: Optional[DefaultFileUploader] = None, 3029 **kwargs: Any, 3030 ) -> RecordSelector: 3031 extractor = self._create_component_from_model( 3032 model=model.extractor, decoder=decoder, config=config 3033 ) 3034 record_filter = ( 3035 self._create_component_from_model(model.record_filter, config=config) 3036 if model.record_filter 3037 else None 3038 ) 3039 3040 transform_before_filtering = ( 3041 False if model.transform_before_filtering is None else model.transform_before_filtering 3042 ) 3043 if client_side_incremental_sync: 3044 record_filter = ClientSideIncrementalRecordFilterDecorator( 3045 config=config, 3046 parameters=model.parameters, 3047 condition=model.record_filter.condition 3048 if (model.record_filter and hasattr(model.record_filter, "condition")) 3049 else None, 3050 **client_side_incremental_sync, 3051 ) 3052 transform_before_filtering = ( 3053 True 3054 if model.transform_before_filtering is None 3055 else model.transform_before_filtering 3056 ) 3057 3058 if model.schema_normalization is None: 3059 # default to no schema normalization if not set 3060 model.schema_normalization = SchemaNormalizationModel.None_ 3061 3062 schema_normalization = ( 3063 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3064 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3065 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3066 ) 3067 3068 return RecordSelector( 3069 extractor=extractor, 3070 name=name, 3071 config=config, 3072 record_filter=record_filter, 3073 transformations=transformations or [], 3074 file_uploader=file_uploader, 3075 schema_normalization=schema_normalization, 3076 parameters=model.parameters or {}, 3077 transform_before_filtering=transform_before_filtering, 3078 )
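# Illustrative sketch (not part of this module): the transform_before_filtering default above depends
# on whether client-side incremental sync is enabled. An explicitly configured value always wins;
# otherwise the flag defaults to False, flipping to True when the client-side incremental filter is
# in place, presumably so the filter sees transformed records. A standalone version of that decision:
from typing import Optional

def _transform_before_filtering_sketch(
    configured: Optional[bool], client_side_incremental_sync: bool
) -> bool:
    if configured is not None:
        return configured
    return client_side_incremental_sync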
def create_selective_authenticator(
    self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeAuthenticator:
    authenticators = {
        name: self._create_component_from_model(model=auth, config=config)
        for name, auth in model.authenticators.items()
    }
    # SelectiveAuthenticator returns an instance of DeclarativeAuthenticator or raises a ValueError
    return SelectiveAuthenticator(  # type: ignore[abstract]
        config=config,
        authenticators=authenticators,
        authenticator_selection_path=model.authenticator_selection_path,
        **kwargs,
    )
3103 @staticmethod 3104 def create_legacy_session_token_authenticator( 3105 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3106 ) -> LegacySessionTokenAuthenticator: 3107 return LegacySessionTokenAuthenticator( 3108 api_url=url_base, 3109 header=model.header, 3110 login_url=model.login_url, 3111 password=model.password or "", 3112 session_token=model.session_token or "", 3113 session_token_response_key=model.session_token_response_key or "", 3114 username=model.username or "", 3115 validate_session_url=model.validate_session_url, 3116 config=config, 3117 parameters=model.parameters or {}, 3118 )
def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    stream_slicer: Optional[StreamSlicer],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    stop_condition_on_cursor: bool = False,
    client_side_incremental_sync: Optional[Dict[str, Any]] = None,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[
            IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel
        ]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    def _get_url() -> str:
        """
        Closure to get the URL from the requester. This is needed for the lazy retriever case,
        because the URL is not known until the requester has been created.
        """

        _url = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else requester.get_url()
        )
        _url_base = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else requester.get_url_base()
        )

        return _url or _url_base

    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync=client_side_incremental_sync,
        file_uploader=file_uploader,
    )

    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    if self._query_properties_in_request_parameters(model.requester):
        # It is better to raise an explicit error if PropertiesFromEndpoint is defined in multiple
        # places instead of defaulting to request_parameters, which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], config=config
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            config=config,
        )

    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    # Define a cursor only if per-partition or standard incremental support is needed
    cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None

    if (
        not isinstance(stream_slicer, DatetimeBasedCursor)
        or type(stream_slicer) is not DatetimeBasedCursor
    ):
        # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
        # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
        # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the SimpleRetriever's
        # request_options_provider
        request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
    elif not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})

    stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})

    cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None
    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor_used_for_stop_condition,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            elif incremental_sync.step or incremental_sync.cursor_granularity:
                raise ValueError(
                    f"Found more than one slice per parent. LazySimpleRetriever only supports a single-slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3287 ) 3288 3289 return LazySimpleRetriever( 3290 name=name, 3291 paginator=paginator, 3292 primary_key=primary_key, 3293 requester=requester, 3294 record_selector=record_selector, 3295 stream_slicer=stream_slicer, 3296 request_option_provider=request_options_provider, 3297 cursor=cursor, 3298 config=config, 3299 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3300 parameters=model.parameters or {}, 3301 ) 3302 3303 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3304 return SimpleRetrieverTestReadDecorator( 3305 name=name, 3306 paginator=paginator, 3307 primary_key=primary_key, 3308 requester=requester, 3309 record_selector=record_selector, 3310 stream_slicer=stream_slicer, 3311 request_option_provider=request_options_provider, 3312 cursor=cursor, 3313 config=config, 3314 maximum_number_of_slices=self._limit_slices_fetched or 5, 3315 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3316 log_formatter=log_formatter, 3317 parameters=model.parameters or {}, 3318 ) 3319 return SimpleRetriever( 3320 name=name, 3321 paginator=paginator, 3322 primary_key=primary_key, 3323 requester=requester, 3324 record_selector=record_selector, 3325 stream_slicer=stream_slicer, 3326 request_option_provider=request_options_provider, 3327 cursor=cursor, 3328 config=config, 3329 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3330 additional_query_properties=query_properties, 3331 parameters=model.parameters or {}, 3332 )
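# Illustrative sketch (not part of this module): create_simple_retriever ultimately returns one of
# three retriever flavors. A simplified, hypothetical decision helper mirroring the branches above:
def _pick_retriever_flavor_sketch(
    lazy_parent_pointers: bool,
    has_existing_state: bool,
    limit_slices_fetched: bool,
    emit_connector_builder_messages: bool,
) -> str:
    if lazy_parent_pointers and not has_existing_state:
        return "LazySimpleRetriever"  # substream parents read lazily via lazy_read_pointer
    if limit_slices_fetched or emit_connector_builder_messages:
        return "SimpleRetrieverTestReadDecorator"  # caps the number of slices for test reads
    return "SimpleRetriever"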
def create_state_delegating_stream(
    self,
    model: StateDelegatingStreamModel,
    config: Config,
    has_parent_state: Optional[bool] = None,
    **kwargs: Any,
) -> DeclarativeStream:
    if (
        model.full_refresh_stream.name != model.name
        or model.name != model.incremental_stream.name
    ):
        raise ValueError(
            f"state_delegating_stream, full_refresh_stream and incremental_stream must all have the same name. Got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
        )

    stream_model = (
        model.incremental_stream
        if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state
        else model.full_refresh_stream
    )

    return self._create_component_from_model(stream_model, config=config, **kwargs)  # type: ignore[no-any-return] # a DeclarativeStream will be created since stream_model is a stream definition
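# Illustrative sketch (not part of this module): StateDelegatingStream picks the incremental variant
# once any state exists (or a parent carries state) and the full-refresh variant otherwise. A
# standalone version of that selection over hypothetical stream definitions:
from typing import Any, Mapping, Optional

def _select_stream_sketch(
    full_refresh_stream: Any,
    incremental_stream: Any,
    stream_state: Optional[Mapping[str, Any]],
    has_parent_state: Optional[bool],
) -> Any:
    return incremental_stream if stream_state or has_parent_state else full_refresh_stream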
3410 def create_async_retriever( 3411 self, 3412 model: AsyncRetrieverModel, 3413 config: Config, 3414 *, 3415 name: str, 3416 primary_key: Optional[ 3417 Union[str, List[str], List[List[str]]] 3418 ], # this seems to be needed to match create_simple_retriever 3419 stream_slicer: Optional[StreamSlicer], 3420 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3421 transformations: List[RecordTransformation], 3422 **kwargs: Any, 3423 ) -> AsyncRetriever: 3424 def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever: 3425 record_selector = RecordSelector( 3426 extractor=download_extractor, 3427 name=name, 3428 record_filter=None, 3429 transformations=transformations, 3430 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3431 config=config, 3432 parameters={}, 3433 ) 3434 paginator = ( 3435 self._create_component_from_model( 3436 model=model.download_paginator, 3437 decoder=decoder, 3438 config=config, 3439 url_base="", 3440 ) 3441 if model.download_paginator 3442 else NoPagination(parameters={}) 3443 ) 3444 maximum_number_of_slices = self._limit_slices_fetched or 5 3445 3446 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3447 return SimpleRetrieverTestReadDecorator( 3448 requester=download_requester, 3449 record_selector=record_selector, 3450 primary_key=None, 3451 name=job_download_components_name, 3452 paginator=paginator, 3453 config=config, 3454 parameters={}, 3455 maximum_number_of_slices=maximum_number_of_slices, 3456 ) 3457 3458 return SimpleRetriever( 3459 requester=download_requester, 3460 record_selector=record_selector, 3461 primary_key=None, 3462 name=job_download_components_name, 3463 paginator=paginator, 3464 config=config, 3465 parameters={}, 3466 ) 3467 3468 def _get_job_timeout() -> datetime.timedelta: 3469 user_defined_timeout: Optional[int] = ( 3470 int( 3471 InterpolatedString.create( 3472 str(model.polling_job_timeout), 3473 parameters={}, 3474 ).eval(config) 3475 ) 3476 if model.polling_job_timeout 3477 else None 3478 ) 3479 3480 # check for user defined timeout during the test read or 15 minutes 3481 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3482 # default value for non-connector builder is 60 minutes. 
3483 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3484 3485 return ( 3486 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3487 ) 3488 3489 decoder = ( 3490 self._create_component_from_model(model=model.decoder, config=config) 3491 if model.decoder 3492 else JsonDecoder(parameters={}) 3493 ) 3494 record_selector = self._create_component_from_model( 3495 model=model.record_selector, 3496 config=config, 3497 decoder=decoder, 3498 name=name, 3499 transformations=transformations, 3500 client_side_incremental_sync=client_side_incremental_sync, 3501 ) 3502 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3503 creation_requester = self._create_component_from_model( 3504 model=model.creation_requester, 3505 decoder=decoder, 3506 config=config, 3507 name=f"job creation - {name}", 3508 ) 3509 polling_requester = self._create_component_from_model( 3510 model=model.polling_requester, 3511 decoder=decoder, 3512 config=config, 3513 name=f"job polling - {name}", 3514 ) 3515 job_download_components_name = f"job download - {name}" 3516 download_decoder = ( 3517 self._create_component_from_model(model=model.download_decoder, config=config) 3518 if model.download_decoder 3519 else JsonDecoder(parameters={}) 3520 ) 3521 download_extractor = ( 3522 self._create_component_from_model( 3523 model=model.download_extractor, 3524 config=config, 3525 decoder=download_decoder, 3526 parameters=model.parameters, 3527 ) 3528 if model.download_extractor 3529 else DpathExtractor( 3530 [], 3531 config=config, 3532 decoder=download_decoder, 3533 parameters=model.parameters or {}, 3534 ) 3535 ) 3536 download_requester = self._create_component_from_model( 3537 model=model.download_requester, 3538 decoder=download_decoder, 3539 config=config, 3540 name=job_download_components_name, 3541 ) 3542 download_retriever = _get_download_retriever() 3543 abort_requester = ( 3544 self._create_component_from_model( 3545 model=model.abort_requester, 3546 decoder=decoder, 3547 config=config, 3548 name=f"job abort - {name}", 3549 ) 3550 if model.abort_requester 3551 else None 3552 ) 3553 delete_requester = ( 3554 self._create_component_from_model( 3555 model=model.delete_requester, 3556 decoder=decoder, 3557 config=config, 3558 name=f"job delete - {name}", 3559 ) 3560 if model.delete_requester 3561 else None 3562 ) 3563 download_target_requester = ( 3564 self._create_component_from_model( 3565 model=model.download_target_requester, 3566 decoder=decoder, 3567 config=config, 3568 name=f"job extract_url - {name}", 3569 ) 3570 if model.download_target_requester 3571 else None 3572 ) 3573 status_extractor = self._create_component_from_model( 3574 model=model.status_extractor, decoder=decoder, config=config, name=name 3575 ) 3576 download_target_extractor = self._create_component_from_model( 3577 model=model.download_target_extractor, 3578 decoder=decoder, 3579 config=config, 3580 name=name, 3581 ) 3582 3583 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3584 creation_requester=creation_requester, 3585 polling_requester=polling_requester, 3586 download_retriever=download_retriever, 3587 download_target_requester=download_target_requester, 3588 abort_requester=abort_requester, 3589 delete_requester=delete_requester, 3590 status_extractor=status_extractor, 3591 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3592 download_target_extractor=download_target_extractor, 3593 job_timeout=_get_job_timeout(), 3594 ) 3595 
    async_job_partition_router = AsyncJobPartitionRouter(
        job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
            job_repository,
            stream_slices,
            self._job_tracker,
            self._message_repository,
            # FIXME: work would be needed here to detect whether a stream has a parent stream that is bulk
            has_bulk_parent=False,
            # Set `job_max_retry` to 1 for the Connector Builder use case;
            # `None` means the default of 3 attempts is used under the hood.
            job_max_retry=1 if self._emit_connector_builder_messages else None,
        ),
        stream_slicer=stream_slicer,
        config=config,
        parameters=model.parameters or {},
    )

    return AsyncRetriever(
        record_selector=record_selector,
        stream_slicer=async_job_partition_router,
        config=config,
        parameters=model.parameters or {},
    )
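# Illustrative sketch (not part of this module): _get_job_timeout above resolves the async job timeout
# from an optional user-defined value, falling back to 15 minutes for Connector Builder test reads and
# 60 minutes for regular syncs. A standalone version of that resolution:
import datetime
from typing import Optional

def _job_timeout_sketch(
    user_defined_minutes: Optional[int], emit_connector_builder_messages: bool
) -> datetime.timedelta:
    default_minutes = 15 if emit_connector_builder_messages else 60
    return datetime.timedelta(minutes=user_defined_minutes or default_minutes)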
3620 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3621 config_migrations = [ 3622 self._create_component_from_model(migration, config) 3623 for migration in ( 3624 model.config_normalization_rules.config_migrations 3625 if ( 3626 model.config_normalization_rules 3627 and model.config_normalization_rules.config_migrations 3628 ) 3629 else [] 3630 ) 3631 ] 3632 config_transformations = [ 3633 self._create_component_from_model(transformation, config) 3634 for transformation in ( 3635 model.config_normalization_rules.transformations 3636 if ( 3637 model.config_normalization_rules 3638 and model.config_normalization_rules.transformations 3639 ) 3640 else [] 3641 ) 3642 ] 3643 config_validations = [ 3644 self._create_component_from_model(validation, config) 3645 for validation in ( 3646 model.config_normalization_rules.validations 3647 if ( 3648 model.config_normalization_rules 3649 and model.config_normalization_rules.validations 3650 ) 3651 else [] 3652 ) 3653 ] 3654 3655 return Spec( 3656 connection_specification=model.connection_specification, 3657 documentation_url=model.documentation_url, 3658 advanced_auth=model.advanced_auth, 3659 parameters={}, 3660 config_migrations=config_migrations, 3661 config_transformations=config_transformations, 3662 config_validations=config_validations, 3663 )
3665 def create_substream_partition_router( 3666 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3667 ) -> SubstreamPartitionRouter: 3668 parent_stream_configs = [] 3669 if model.parent_stream_configs: 3670 parent_stream_configs.extend( 3671 [ 3672 self._create_message_repository_substream_wrapper( 3673 model=parent_stream_config, config=config, **kwargs 3674 ) 3675 for parent_stream_config in model.parent_stream_configs 3676 ] 3677 ) 3678 3679 return SubstreamPartitionRouter( 3680 parent_stream_configs=parent_stream_configs, 3681 parameters=model.parameters or {}, 3682 config=config, 3683 )
@staticmethod
def create_wait_time_from_header(
    model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitTimeFromHeaderBackoffStrategy:
    return WaitTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=model.parameters or {},
        config=config,
        regex=model.regex,
        max_waiting_time_in_seconds=model.max_waiting_time_in_seconds,
    )
3725 @staticmethod 3726 def create_wait_until_time_from_header( 3727 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3728 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3729 return WaitUntilTimeFromHeaderBackoffStrategy( 3730 header=model.header, 3731 parameters=model.parameters or {}, 3732 config=config, 3733 min_wait=model.min_wait, 3734 regex=model.regex, 3735 )
3743 @staticmethod 3744 def create_components_mapping_definition( 3745 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3746 ) -> ComponentMappingDefinition: 3747 interpolated_value = InterpolatedString.create( 3748 model.value, parameters=model.parameters or {} 3749 ) 3750 field_path = [ 3751 InterpolatedString.create(path, parameters=model.parameters or {}) 3752 for path in model.field_path 3753 ] 3754 return ComponentMappingDefinition( 3755 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3756 value=interpolated_value, 3757 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3758 create_or_update=model.create_or_update, 3759 parameters=model.parameters or {}, 3760 )
3762 def create_http_components_resolver( 3763 self, model: HttpComponentsResolverModel, config: Config 3764 ) -> Any: 3765 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3766 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3767 3768 retriever = self._create_component_from_model( 3769 model=model.retriever, 3770 config=config, 3771 name="", 3772 primary_key=None, 3773 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3774 transformations=[], 3775 ) 3776 3777 components_mapping = [ 3778 self._create_component_from_model( 3779 model=components_mapping_definition_model, 3780 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3781 components_mapping_definition_model.value_type 3782 ), 3783 config=config, 3784 ) 3785 for components_mapping_definition_model in model.components_mapping 3786 ] 3787 3788 return HttpComponentsResolver( 3789 retriever=retriever, 3790 config=config, 3791 components_mapping=components_mapping, 3792 parameters=model.parameters or {}, 3793 )
3795 @staticmethod 3796 def create_stream_config( 3797 model: StreamConfigModel, config: Config, **kwargs: Any 3798 ) -> StreamConfig: 3799 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3800 [x for x in model.configs_pointer] if model.configs_pointer else [] 3801 ) 3802 3803 return StreamConfig( 3804 configs_pointer=model_configs_pointer, 3805 default_values=model.default_values, 3806 parameters=model.parameters or {}, 3807 )
3809 def create_config_components_resolver( 3810 self, model: ConfigComponentsResolverModel, config: Config 3811 ) -> Any: 3812 model_stream_configs = ( 3813 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3814 ) 3815 3816 stream_configs = [ 3817 self._create_component_from_model( 3818 stream_config, config=config, parameters=model.parameters or {} 3819 ) 3820 for stream_config in model_stream_configs 3821 ] 3822 3823 components_mapping = [ 3824 self._create_component_from_model( 3825 model=components_mapping_definition_model, 3826 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3827 components_mapping_definition_model.value_type 3828 ), 3829 config=config, 3830 ) 3831 for components_mapping_definition_model in model.components_mapping 3832 ] 3833 3834 return ConfigComponentsResolver( 3835 stream_configs=stream_configs, 3836 config=config, 3837 components_mapping=components_mapping, 3838 parameters=model.parameters or {}, 3839 )
3863 def create_http_api_budget( 3864 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3865 ) -> HttpAPIBudget: 3866 policies = [ 3867 self._create_component_from_model(model=policy, config=config) 3868 for policy in model.policies 3869 ] 3870 3871 return HttpAPIBudget( 3872 policies=policies, 3873 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3874 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3875 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3876 )
3878 def create_fixed_window_call_rate_policy( 3879 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3880 ) -> FixedWindowCallRatePolicy: 3881 matchers = [ 3882 self._create_component_from_model(model=matcher, config=config) 3883 for matcher in model.matchers 3884 ] 3885 3886 # Set the initial reset timestamp to 10 days from now. 3887 # This value will be updated by the first request. 3888 return FixedWindowCallRatePolicy( 3889 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3890 period=parse_duration(model.period), 3891 call_limit=model.call_limit, 3892 matchers=matchers, 3893 )
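# Illustrative sketch (not part of this module): the policy above is seeded with a reset timestamp far
# in the future (now + 10 days); the real window boundary is learned from API responses once requests
# start flowing. A minimal standalone illustration of that seeding:
import datetime

def _initial_next_reset_ts_sketch(now: datetime.datetime) -> datetime.datetime:
    # Placeholder until the first rate-limited response provides the actual window boundary.
    return now + datetime.timedelta(days=10)

# _initial_next_reset_ts_sketch(datetime.datetime(2025, 1, 1)) -> datetime.datetime(2025, 1, 11, 0, 0)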
3895 def create_file_uploader( 3896 self, model: FileUploaderModel, config: Config, **kwargs: Any 3897 ) -> FileUploader: 3898 name = "File Uploader" 3899 requester = self._create_component_from_model( 3900 model=model.requester, 3901 config=config, 3902 name=name, 3903 **kwargs, 3904 ) 3905 download_target_extractor = self._create_component_from_model( 3906 model=model.download_target_extractor, 3907 config=config, 3908 name=name, 3909 **kwargs, 3910 ) 3911 emit_connector_builder_messages = self._emit_connector_builder_messages 3912 file_uploader = DefaultFileUploader( 3913 requester=requester, 3914 download_target_extractor=download_target_extractor, 3915 config=config, 3916 file_writer=NoopFileWriter() 3917 if emit_connector_builder_messages 3918 else LocalFileSystemFileWriter(), 3919 parameters=model.parameters or {}, 3920 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3921 ) 3922 3923 return ( 3924 ConnectorBuilderFileUploader(file_uploader) 3925 if emit_connector_builder_messages 3926 else file_uploader 3927 )
3929 def create_moving_window_call_rate_policy( 3930 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3931 ) -> MovingWindowCallRatePolicy: 3932 rates = [ 3933 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3934 ] 3935 matchers = [ 3936 self._create_component_from_model(model=matcher, config=config) 3937 for matcher in model.matchers 3938 ] 3939 return MovingWindowCallRatePolicy( 3940 rates=rates, 3941 matchers=matchers, 3942 )
3944 def create_unlimited_call_rate_policy( 3945 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 3946 ) -> UnlimitedCallRatePolicy: 3947 matchers = [ 3948 self._create_component_from_model(model=matcher, config=config) 3949 for matcher in model.matchers 3950 ] 3951 3952 return UnlimitedCallRatePolicy( 3953 matchers=matchers, 3954 )
3963 def create_http_request_matcher( 3964 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 3965 ) -> HttpRequestRegexMatcher: 3966 return HttpRequestRegexMatcher( 3967 method=model.method, 3968 url_base=model.url_base, 3969 url_path_pattern=model.url_path_pattern, 3970 params=model.params, 3971 headers=model.headers, 3972 )
3979 def create_grouping_partition_router( 3980 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 3981 ) -> GroupingPartitionRouter: 3982 underlying_router = self._create_component_from_model( 3983 model=model.underlying_partition_router, config=config 3984 ) 3985 if model.group_size < 1: 3986 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 3987 3988 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 3989 # because they are specific to individual partitions and cannot be aggregated or handled 3990 # when grouping, potentially leading to incorrect API calls. Any request customization 3991 # should be managed at the stream level through the requester's configuration. 3992 if isinstance(underlying_router, SubstreamPartitionRouter): 3993 if any( 3994 parent_config.request_option 3995 for parent_config in underlying_router.parent_stream_configs 3996 ): 3997 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3998 3999 if isinstance(underlying_router, ListPartitionRouter): 4000 if underlying_router.request_option: 4001 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4002 4003 return GroupingPartitionRouter( 4004 group_size=model.group_size, 4005 underlying_partition_router=underlying_router, 4006 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4007 config=config, 4008 )
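# Illustrative sketch (not part of this module): GroupingPartitionRouter batches the partitions
# produced by the underlying router into groups of `group_size`, optionally deduplicating them.
# A simplified, hypothetical version of that grouping over plain partition values:
from typing import Hashable, Iterable, Iterator, List

def _group_partitions_sketch(
    partitions: Iterable[Hashable], group_size: int, deduplicate: bool = True
) -> Iterator[List[Hashable]]:
    seen: set = set()
    group: List[Hashable] = []
    for partition in partitions:
        if deduplicate:
            if partition in seen:
                continue
            seen.add(partition)
        group.append(partition)
        if len(group) == group_size:
            yield group
            group = []
    if group:
        yield group

# list(_group_partitions_sketch(["a", "b", "a", "c"], 2)) -> [["a", "b"], ["c"]]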