airbyte_cdk.sources.declarative.parsers.model_to_component_factory
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.models import FailureType, Level
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative import transformations
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ChildPartitionResumableFullRefreshCursor,
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
    CursorFactory,
    DatetimeBasedCursor,
    DeclarativeCursor,
    GlobalSubstreamCursor,
    PerPartitionCursor,
    PerPartitionWithGlobalCursor,
    ResumableFullRefreshCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomIncrementalSync as CustomIncrementalSyncModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    CustomValidationStrategy as CustomValidationStrategyModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
    COMPONENTS_MODULE_NAME,
    SDM_COMPONENTS_MODULE_NAME,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    ParametrizedComponentsResolver,
    StreamConfig,
    StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import (
    StreamSlicer,
    StreamSlicerTestReadDecorator,
)
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        disable_resumable_full_refresh: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._disable_resumable_full_refresh = disable_resumable_full_refresh
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomIncrementalSyncModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_declarative_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )
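    # Illustrative usage sketch (not part of the original source): create_component() is the
    # public entry point that turns one manifest snippet into a runtime object. The component
    # names are real, but the definition values below are made-up examples; later sketches in
    # this listing reuse this `factory` instance.
    #
    #     factory = ModelToComponentFactory()
    #     checker = factory.create_component(
    #         model_type=CheckStreamModel,
    #         component_definition={"type": "CheckStream", "stream_names": ["users"]},
    #         config={},
    #     )
    #     # `checker` is the CheckStream built by create_check_stream()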
    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()
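    # Illustrative sketch (not part of the original source): an AddFields record transformation
    # as it might appear in a manifest, built through the same create_component() entry point.
    # The field path and interpolated value are made-up examples.
    #
    #     add_fields = factory.create_component(
    #         model_type=AddFieldsModel,
    #         component_definition={
    #             "type": "AddFields",
    #             "fields": [
    #                 {
    #                     "type": "AddedFieldDefinition",
    #                     "path": ["copied_id"],
    #                     "value": "{{ record['id'] }}",
    #                 }
    #             ],
    #         },
    #         config={},
    #     )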
    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )
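    # Illustrative sketch (not part of the original source): an ApiKeyAuthenticator definition
    # using inject_into rather than the deprecated header option. The header name and config key
    # are made-up examples; api_token is interpolated from the connector config.
    #
    #     authenticator = factory.create_component(
    #         model_type=ApiKeyAuthenticatorModel,
    #         component_definition={
    #             "type": "ApiKeyAuthenticator",
    #             "api_token": "{{ config['api_key'] }}",
    #             "inject_into": {
    #                 "type": "RequestOption",
    #                 "inject_into": "header",
    #                 "field_name": "X-API-Key",
    #             },
    #         },
    #         config={"api_key": "secret"},
    #     )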
    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )
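    # Illustrative sketch (not part of the original source): a SessionTokenAuthenticator
    # definition. The URL, paths, and token field are made-up examples. Depending on
    # request_authentication.type, the factory returns either a BearerAuthenticator or an
    # ApiKeyAuthenticator wrapped around a SessionTokenProvider.
    #
    #     session_auth = factory.create_component(
    #         model_type=SessionTokenAuthenticatorModel,
    #         component_definition={
    #             "type": "SessionTokenAuthenticator",
    #             "login_requester": {
    #                 "type": "HttpRequester",
    #                 "url_base": "https://api.example.com",
    #                 "path": "/login",
    #                 "http_method": "POST",
    #             },
    #             "session_token_path": ["token"],
    #             "request_authentication": {"type": "Bearer"},
    #         },
    #         config={},
    #         name="my_stream",
    #     )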
    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state
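    # Illustrative sketch (not part of the original source): apply_stream_state_migrations()
    # relies only on the duck-typed should_migrate()/migrate() interface, so a made-up migration
    # like the one below would be accepted.
    #
    #     class RenameCursorKeyMigration:
    #         def should_migrate(self, stream_state):
    #             return "updated" in stream_state
    #
    #         def migrate(self, stream_state):
    #             return {"updated_at": stream_state["updated"]}
    #
    #     state = ModelToComponentFactory.apply_stream_state_migrations(
    #         [RenameCursorKeyMigration()], {"updated": "2024-01-01"}
    #     )
    #     # state == {"updated_at": "2024-01-01"}
    #
    # The create_concurrent_cursor_from_datetime_based_cursor method defined below can be
    # exercised in a similar spirit; the stream name, cursor field, and dates here are made-up
    # examples, and step/cursor_granularity must be provided together:
    #
    #     cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
    #         model_type=DatetimeBasedCursorModel,
    #         component_definition={
    #             "type": "DatetimeBasedCursor",
    #             "cursor_field": "updated_at",
    #             "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    #             "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ"],
    #             "start_datetime": "2024-01-01T00:00:00Z",
    #             "step": "P30D",
    #             "cursor_granularity": "PT1S",
    #         },
    #         stream_name="users",
    #         stream_namespace=None,
    #         config={},
    #     )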
1236 stream_state = dict(state_migration.migrate(stream_state)) 1237 return stream_state 1238 1239 def create_concurrent_cursor_from_datetime_based_cursor( 1240 self, 1241 model_type: Type[BaseModel], 1242 component_definition: ComponentDefinition, 1243 stream_name: str, 1244 stream_namespace: Optional[str], 1245 config: Config, 1246 message_repository: Optional[MessageRepository] = None, 1247 runtime_lookback_window: Optional[datetime.timedelta] = None, 1248 stream_state_migrations: Optional[List[Any]] = None, 1249 **kwargs: Any, 1250 ) -> ConcurrentCursor: 1251 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1252 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1253 # incoming state and connector_state_manager that is initialized when the component factory is created 1254 stream_state = ( 1255 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1256 if "stream_state" not in kwargs 1257 else kwargs["stream_state"] 1258 ) 1259 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1260 1261 component_type = component_definition.get("type") 1262 if component_definition.get("type") != model_type.__name__: 1263 raise ValueError( 1264 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1265 ) 1266 1267 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1268 1269 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1270 raise ValueError( 1271 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1272 ) 1273 1274 interpolated_cursor_field = InterpolatedString.create( 1275 datetime_based_cursor_model.cursor_field, 1276 parameters=datetime_based_cursor_model.parameters or {}, 1277 ) 1278 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1279 1280 interpolated_partition_field_start = InterpolatedString.create( 1281 datetime_based_cursor_model.partition_field_start or "start_time", 1282 parameters=datetime_based_cursor_model.parameters or {}, 1283 ) 1284 interpolated_partition_field_end = InterpolatedString.create( 1285 datetime_based_cursor_model.partition_field_end or "end_time", 1286 parameters=datetime_based_cursor_model.parameters or {}, 1287 ) 1288 1289 slice_boundary_fields = ( 1290 interpolated_partition_field_start.eval(config=config), 1291 interpolated_partition_field_end.eval(config=config), 1292 ) 1293 1294 datetime_format = datetime_based_cursor_model.datetime_format 1295 1296 cursor_granularity = ( 1297 parse_duration(datetime_based_cursor_model.cursor_granularity) 1298 if datetime_based_cursor_model.cursor_granularity 1299 else None 1300 ) 1301 1302 lookback_window = None 1303 interpolated_lookback_window = ( 1304 InterpolatedString.create( 1305 datetime_based_cursor_model.lookback_window, 1306 parameters=datetime_based_cursor_model.parameters or {}, 1307 ) 1308 if datetime_based_cursor_model.lookback_window 1309 else None 1310 ) 1311 if interpolated_lookback_window: 1312 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1313 if evaluated_lookback_window: 1314 lookback_window = parse_duration(evaluated_lookback_window) 1315 1316 connector_state_converter: DateTimeStreamStateConverter 1317 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1318 datetime_format=datetime_format, 1319 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1320 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1321 cursor_granularity=cursor_granularity, 1322 ) 1323 1324 # Adjusts the stream state by applying the runtime lookback window. 1325 # This is used to ensure correct state handling in case of failed partitions. 1326 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1327 if runtime_lookback_window and stream_state_value: 1328 new_stream_state = ( 1329 connector_state_converter.parse_timestamp(stream_state_value) 1330 - runtime_lookback_window 1331 ) 1332 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1333 new_stream_state 1334 ) 1335 1336 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1337 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1338 start_date_runtime_value = self.create_min_max_datetime( 1339 model=datetime_based_cursor_model.start_datetime, config=config 1340 ) 1341 else: 1342 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1343 1344 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1345 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1346 end_date_runtime_value = self.create_min_max_datetime( 1347 model=datetime_based_cursor_model.end_datetime, config=config 1348 ) 1349 else: 1350 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1351 1352 interpolated_start_date = MinMaxDatetime.create( 1353 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1354 parameters=datetime_based_cursor_model.parameters, 1355 ) 1356 interpolated_end_date = ( 1357 None 1358 if not end_date_runtime_value 1359 else MinMaxDatetime.create( 1360 end_date_runtime_value, datetime_based_cursor_model.parameters 1361 ) 1362 ) 1363 1364 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1365 if not interpolated_start_date.datetime_format: 1366 interpolated_start_date.datetime_format = datetime_format 1367 if interpolated_end_date and not interpolated_end_date.datetime_format: 1368 interpolated_end_date.datetime_format = datetime_format 1369 1370 start_date = interpolated_start_date.get_datetime(config=config) 1371 end_date_provider = ( 1372 partial(interpolated_end_date.get_datetime, config) 1373 if interpolated_end_date 1374 else connector_state_converter.get_end_provider() 1375 ) 1376 1377 if ( 1378 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1379 ) or ( 1380 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1381 ): 1382 raise ValueError( 1383 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1384 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1385 ) 1386 1387 # When step is not defined, default to a step size from the starting date to the present moment 1388 step_length = datetime.timedelta.max 1389 interpolated_step = ( 1390 InterpolatedString.create( 1391 datetime_based_cursor_model.step, 1392 parameters=datetime_based_cursor_model.parameters or {}, 1393 ) 1394 if datetime_based_cursor_model.step 1395 else None 1396 ) 1397 if interpolated_step: 1398 evaluated_step = interpolated_step.eval(config) 1399 if evaluated_step: 1400 step_length = parse_duration(evaluated_step) 1401 1402 clamping_strategy: ClampingStrategy = NoClamping() 1403 if datetime_based_cursor_model.clamping: 1404 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1405 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1406 # object which we want to keep agnostic of being low-code 1407 target = InterpolatedString( 1408 string=datetime_based_cursor_model.clamping.target, 1409 parameters=datetime_based_cursor_model.parameters or {}, 1410 ) 1411 evaluated_target = target.eval(config=config) 1412 match evaluated_target: 1413 case "DAY": 1414 clamping_strategy = DayClampingStrategy() 1415 end_date_provider = ClampingEndProvider( 1416 DayClampingStrategy(is_ceiling=False), 1417 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1418 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1419 ) 1420 case "WEEK": 1421 if ( 1422 not datetime_based_cursor_model.clamping.target_details 1423 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1424 ): 1425 raise ValueError( 1426 "Given WEEK clamping, weekday needs to be provided as target_details" 1427 ) 1428 weekday = self._assemble_weekday( 1429 datetime_based_cursor_model.clamping.target_details["weekday"] 1430 ) 1431 clamping_strategy = WeekClampingStrategy(weekday) 1432 end_date_provider = ClampingEndProvider( 1433 WeekClampingStrategy(weekday, is_ceiling=False), 1434 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1435 granularity=cursor_granularity or datetime.timedelta(days=1), 1436 ) 1437 case "MONTH": 1438 clamping_strategy = MonthClampingStrategy() 1439 end_date_provider = ClampingEndProvider( 1440 MonthClampingStrategy(is_ceiling=False), 1441 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1442 granularity=cursor_granularity or datetime.timedelta(days=1), 1443 ) 1444 case _: 1445 raise ValueError( 1446 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1447 ) 1448 1449 return ConcurrentCursor( 1450 stream_name=stream_name, 1451 stream_namespace=stream_namespace, 1452 stream_state=stream_state, 1453 message_repository=message_repository or self._message_repository, 1454 connector_state_manager=self._connector_state_manager, 1455 connector_state_converter=connector_state_converter, 1456 cursor_field=cursor_field, 1457 slice_boundary_fields=slice_boundary_fields, 1458 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1459 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1460 lookback_window=lookback_window, 1461 slice_range=step_length, 1462 cursor_granularity=cursor_granularity, 1463 clamping_strategy=clamping_strategy, 1464 ) 1465 1466 def create_concurrent_cursor_from_incrementing_count_cursor( 1467 self, 1468 model_type: Type[BaseModel], 1469 component_definition: ComponentDefinition, 1470 stream_name: str, 1471 stream_namespace: Optional[str], 1472 config: Config, 1473 message_repository: Optional[MessageRepository] = None, 1474 **kwargs: Any, 1475 ) -> ConcurrentCursor: 1476 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1477 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1478 # incoming state and connector_state_manager that is initialized when the component factory is created 1479 stream_state = ( 1480 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1481 if "stream_state" not in kwargs 1482 else kwargs["stream_state"] 1483 ) 1484 1485 component_type = component_definition.get("type") 1486 if component_definition.get("type") != model_type.__name__: 1487 raise ValueError( 1488 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1489 ) 1490 1491 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1492 1493 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1494 raise ValueError( 1495 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1496 ) 1497 1498 interpolated_start_value = ( 1499 InterpolatedString.create( 1500 incrementing_count_cursor_model.start_value, # type: ignore 1501 parameters=incrementing_count_cursor_model.parameters or {}, 1502 ) 1503 if incrementing_count_cursor_model.start_value 1504 else 0 1505 ) 1506 1507 interpolated_cursor_field = InterpolatedString.create( 1508 incrementing_count_cursor_model.cursor_field, 1509 parameters=incrementing_count_cursor_model.parameters or {}, 1510 ) 1511 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1512 1513 connector_state_converter = IncrementingCountStreamStateConverter( 1514 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1515 ) 1516 1517 return ConcurrentCursor( 1518 stream_name=stream_name, 1519 stream_namespace=stream_namespace, 1520 stream_state=stream_state, 1521 message_repository=message_repository or self._message_repository, 
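# Note: unlike the datetime-based cursor above, no cursor granularity, lookback window or slice
# boundary fields are configured for an incrementing count; only the numeric start value and the
# converter's default end provider are passed below.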
1522 connector_state_manager=self._connector_state_manager, 1523 connector_state_converter=connector_state_converter, 1524 cursor_field=cursor_field, 1525 slice_boundary_fields=None, 1526 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1527 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1528 ) 1529 1530 def _assemble_weekday(self, weekday: str) -> Weekday: 1531 match weekday: 1532 case "MONDAY": 1533 return Weekday.MONDAY 1534 case "TUESDAY": 1535 return Weekday.TUESDAY 1536 case "WEDNESDAY": 1537 return Weekday.WEDNESDAY 1538 case "THURSDAY": 1539 return Weekday.THURSDAY 1540 case "FRIDAY": 1541 return Weekday.FRIDAY 1542 case "SATURDAY": 1543 return Weekday.SATURDAY 1544 case "SUNDAY": 1545 return Weekday.SUNDAY 1546 case _: 1547 raise ValueError(f"Unknown weekday {weekday}") 1548 1549 def create_concurrent_cursor_from_perpartition_cursor( 1550 self, 1551 state_manager: ConnectorStateManager, 1552 model_type: Type[BaseModel], 1553 component_definition: ComponentDefinition, 1554 stream_name: str, 1555 stream_namespace: Optional[str], 1556 config: Config, 1557 stream_state: MutableMapping[str, Any], 1558 partition_router: PartitionRouter, 1559 stream_state_migrations: Optional[List[Any]] = None, 1560 **kwargs: Any, 1561 ) -> ConcurrentPerPartitionCursor: 1562 component_type = component_definition.get("type") 1563 if component_definition.get("type") != model_type.__name__: 1564 raise ValueError( 1565 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1566 ) 1567 1568 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1569 1570 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1571 raise ValueError( 1572 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1573 ) 1574 1575 interpolated_cursor_field = InterpolatedString.create( 1576 datetime_based_cursor_model.cursor_field, 1577 parameters=datetime_based_cursor_model.parameters or {}, 1578 ) 1579 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1580 1581 datetime_format = datetime_based_cursor_model.datetime_format 1582 1583 cursor_granularity = ( 1584 parse_duration(datetime_based_cursor_model.cursor_granularity) 1585 if datetime_based_cursor_model.cursor_granularity 1586 else None 1587 ) 1588 1589 connector_state_converter: DateTimeStreamStateConverter 1590 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1591 datetime_format=datetime_format, 1592 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1593 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1594 cursor_granularity=cursor_granularity, 1595 ) 1596 1597 # Create the cursor factory 1598 cursor_factory = ConcurrentCursorFactory( 1599 partial( 1600 self.create_concurrent_cursor_from_datetime_based_cursor, 1601 state_manager=state_manager, 1602 model_type=model_type, 1603 component_definition=component_definition, 1604 stream_name=stream_name, 1605 stream_namespace=stream_namespace, 1606 config=config, 1607 message_repository=NoopMessageRepository(), 1608 stream_state_migrations=stream_state_migrations, 1609 ) 1610 ) 1611 1612 
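# Rough sketch of how the factory above is used at runtime (partition values are hypothetical): for a
# partition such as {"parent_id": "123"} emitted by the partition router, ConcurrentPerPartitionCursor asks
# the factory for a child cursor seeded with that partition's saved state, which re-enters
# create_concurrent_cursor_from_datetime_based_cursor through the partial defined above.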
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1613 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1614 use_global_cursor = isinstance( 1615 partition_router, GroupingPartitionRouter 1616 ) or component_definition.get("global_substream_cursor", False) 1617 1618 # Return the concurrent cursor and state converter 1619 return ConcurrentPerPartitionCursor( 1620 cursor_factory=cursor_factory, 1621 partition_router=partition_router, 1622 stream_name=stream_name, 1623 stream_namespace=stream_namespace, 1624 stream_state=stream_state, 1625 message_repository=self._message_repository, # type: ignore 1626 connector_state_manager=state_manager, 1627 connector_state_converter=connector_state_converter, 1628 cursor_field=cursor_field, 1629 use_global_cursor=use_global_cursor, 1630 ) 1631 1632 @staticmethod 1633 def create_constant_backoff_strategy( 1634 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1635 ) -> ConstantBackoffStrategy: 1636 return ConstantBackoffStrategy( 1637 backoff_time_in_seconds=model.backoff_time_in_seconds, 1638 config=config, 1639 parameters=model.parameters or {}, 1640 ) 1641 1642 def create_cursor_pagination( 1643 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1644 ) -> CursorPaginationStrategy: 1645 if isinstance(decoder, PaginationDecoderDecorator): 1646 inner_decoder = decoder.decoder 1647 else: 1648 inner_decoder = decoder 1649 decoder = PaginationDecoderDecorator(decoder=decoder) 1650 1651 if self._is_supported_decoder_for_pagination(inner_decoder): 1652 decoder_to_use = decoder 1653 else: 1654 raise ValueError( 1655 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1656 ) 1657 1658 return CursorPaginationStrategy( 1659 cursor_value=model.cursor_value, 1660 decoder=decoder_to_use, 1661 page_size=model.page_size, 1662 stop_condition=model.stop_condition, 1663 config=config, 1664 parameters=model.parameters or {}, 1665 ) 1666 1667 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1668 """ 1669 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1670 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1671 :param model: The Pydantic model of the custom component being created 1672 :param config: The custom defined connector config 1673 :return: The declarative component built from the Pydantic model to be used at runtime 1674 """ 1675 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1676 component_fields = get_type_hints(custom_component_class) 1677 model_args = model.dict() 1678 model_args["config"] = config 1679 1680 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1681 # we defer to these arguments over the component's definition 1682 for key, arg in kwargs.items(): 1683 model_args[key] = arg 1684 1685 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1686 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1687 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1688 for model_field, model_value in model_args.items(): 1689 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1690 if ( 1691 isinstance(model_value, dict) 1692 and "type" not in model_value 1693 and model_field in component_fields 1694 ): 1695 derived_type = self._derive_component_type_from_type_hints( 1696 component_fields.get(model_field) 1697 ) 1698 if derived_type: 1699 model_value["type"] = derived_type 1700 1701 if self._is_component(model_value): 1702 model_args[model_field] = self._create_nested_component( 1703 model, model_field, model_value, config 1704 ) 1705 elif isinstance(model_value, list): 1706 vals = [] 1707 for v in model_value: 1708 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1709 derived_type = self._derive_component_type_from_type_hints( 1710 component_fields.get(model_field) 1711 ) 1712 if derived_type: 1713 v["type"] = derived_type 1714 if self._is_component(v): 1715 vals.append(self._create_nested_component(model, model_field, v, config)) 1716 else: 1717 vals.append(v) 1718 model_args[model_field] = vals 1719 1720 kwargs = { 1721 class_field: model_args[class_field] 1722 for class_field in component_fields.keys() 1723 if class_field in model_args 1724 } 1725 return custom_component_class(**kwargs) 1726 1727 @staticmethod 1728 def _get_class_from_fully_qualified_class_name( 1729 full_qualified_class_name: str, 1730 ) -> Any: 1731 """Get a class from its fully qualified name. 1732 1733 If a custom components module is needed, we assume it is already registered - probably 1734 as `source_declarative_manifest.components` or `components`. 1735 1736 Args: 1737 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1738 1739 Returns: 1740 Any: The class object. 1741 1742 Raises: 1743 ValueError: If the class cannot be loaded. 1744 """ 1745 split = full_qualified_class_name.split(".") 1746 module_name_full = ".".join(split[:-1]) 1747 class_name = split[-1] 1748 1749 try: 1750 module_ref = importlib.import_module(module_name_full) 1751 except ModuleNotFoundError as e: 1752 if split[0] == "source_declarative_manifest": 1753 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1754 try: 1755 import os 1756 1757 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1758 module_ref = importlib.import_module( 1759 module_name_with_source_declarative_manifest 1760 ) 1761 except ModuleNotFoundError: 1762 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1763 else: 1764 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1765 1766 try: 1767 return getattr(module_ref, class_name) 1768 except AttributeError as e: 1769 raise ValueError( 1770 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1771 ) from e 1772 1773 @staticmethod 1774 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1775 interface = field_type 1776 while True: 1777 origin = get_origin(interface) 1778 if origin: 1779 # Unnest types until we reach the raw type 1780 # List[T] -> T 1781 # Optional[List[T]] -> T 1782 args = get_args(interface) 1783 interface = args[0] 1784 else: 1785 break 1786 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1787 return interface.__name__ 1788 return None 1789 1790 @staticmethod 1791 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1792 if not cls: 1793 return False 1794 return cls.__module__ == "builtins" 1795 1796 @staticmethod 1797 def _extract_missing_parameters(error: TypeError) -> List[str]: 1798 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1799 if parameter_search: 1800 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1801 else: 1802 return [] 1803 1804 def _create_nested_component( 1805 self, model: Any, model_field: str, model_value: Any, config: Config 1806 ) -> Any: 1807 type_name = model_value.get("type", None) 1808 if not type_name: 1809 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1810 return model_value 1811 1812 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1813 if model_type: 1814 parsed_model = model_type.parse_obj(model_value) 1815 try: 1816 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1817 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1818 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1819 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1820 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1821 # are needed by a component and could not be shared. 
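# For example (hypothetical manifest snippet), a child component nested under a custom component could declare:
#   $parameters:
#     name: "my_stream"
#     primary_key: "id"
# Any of those keys that match keyword-only arguments of the child component's create method are forwarded
# below through matching_parameters.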
1822 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1823 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1824 model_parameters = model_value.get("$parameters", {}) 1825 matching_parameters = { 1826 kwarg: model_parameters[kwarg] 1827 for kwarg in constructor_kwargs 1828 if kwarg in model_parameters 1829 } 1830 return self._create_component_from_model( 1831 model=parsed_model, config=config, **matching_parameters 1832 ) 1833 except TypeError as error: 1834 missing_parameters = self._extract_missing_parameters(error) 1835 if missing_parameters: 1836 raise ValueError( 1837 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1838 + ", ".join( 1839 ( 1840 f"{type_name}.$parameters.{parameter}" 1841 for parameter in missing_parameters 1842 ) 1843 ) 1844 ) 1845 raise TypeError( 1846 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1847 ) 1848 else: 1849 raise ValueError( 1850 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1851 ) 1852 1853 @staticmethod 1854 def _is_component(model_value: Any) -> bool: 1855 return isinstance(model_value, dict) and model_value.get("type") is not None 1856 1857 def create_datetime_based_cursor( 1858 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1859 ) -> DatetimeBasedCursor: 1860 start_datetime: Union[str, MinMaxDatetime] = ( 1861 model.start_datetime 1862 if isinstance(model.start_datetime, str) 1863 else self.create_min_max_datetime(model.start_datetime, config) 1864 ) 1865 end_datetime: Union[str, MinMaxDatetime, None] = None 1866 if model.is_data_feed and model.end_datetime: 1867 raise ValueError("Data feed does not support end_datetime") 1868 if model.is_data_feed and model.is_client_side_incremental: 1869 raise ValueError( 1870 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1871 ) 1872 if model.end_datetime: 1873 end_datetime = ( 1874 model.end_datetime 1875 if isinstance(model.end_datetime, str) 1876 else self.create_min_max_datetime(model.end_datetime, config) 1877 ) 1878 1879 end_time_option = ( 1880 self._create_component_from_model( 1881 model.end_time_option, config, parameters=model.parameters or {} 1882 ) 1883 if model.end_time_option 1884 else None 1885 ) 1886 start_time_option = ( 1887 self._create_component_from_model( 1888 model.start_time_option, config, parameters=model.parameters or {} 1889 ) 1890 if model.start_time_option 1891 else None 1892 ) 1893 1894 return DatetimeBasedCursor( 1895 cursor_field=model.cursor_field, 1896 cursor_datetime_formats=model.cursor_datetime_formats 1897 if model.cursor_datetime_formats 1898 else [], 1899 cursor_granularity=model.cursor_granularity, 1900 datetime_format=model.datetime_format, 1901 end_datetime=end_datetime, 1902 start_datetime=start_datetime, 1903 step=model.step, 1904 end_time_option=end_time_option, 1905 lookback_window=model.lookback_window, 1906 start_time_option=start_time_option, 1907 partition_field_end=model.partition_field_end, 1908 partition_field_start=model.partition_field_start, 1909 message_repository=self._message_repository, 1910 is_compare_strictly=model.is_compare_strictly, 1911 config=config, 1912 parameters=model.parameters or {}, 1913 ) 1914 1915 def create_declarative_stream( 1916 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1917 ) -> DeclarativeStream: 1918 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1919 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1920 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1921 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1922 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1923 1924 primary_key = model.primary_key.__root__ if model.primary_key else None 1925 stop_condition_on_cursor = ( 1926 model.incremental_sync 1927 and hasattr(model.incremental_sync, "is_data_feed") 1928 and model.incremental_sync.is_data_feed 1929 ) 1930 client_side_incremental_sync = None 1931 if ( 1932 model.incremental_sync 1933 and hasattr(model.incremental_sync, "is_client_side_incremental") 1934 and model.incremental_sync.is_client_side_incremental 1935 ): 1936 supported_slicers = ( 1937 DatetimeBasedCursor, 1938 GlobalSubstreamCursor, 1939 PerPartitionWithGlobalCursor, 1940 ) 1941 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1942 raise ValueError( 1943 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1944 ) 1945 cursor = ( 1946 combined_slicers 1947 if isinstance( 1948 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1949 ) 1950 else self._create_component_from_model(model=model.incremental_sync, config=config) 1951 ) 1952 1953 client_side_incremental_sync = {"cursor": cursor} 1954 1955 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1956 cursor_model = model.incremental_sync 1957 1958 end_time_option = ( 1959 self._create_component_from_model( 1960 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1961 ) 1962 if cursor_model.end_time_option 1963 else None 1964 ) 1965 start_time_option = ( 1966 self._create_component_from_model( 1967 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1968 ) 1969 if cursor_model.start_time_option 1970 else None 1971 ) 1972 1973 request_options_provider = DatetimeBasedRequestOptionsProvider( 1974 start_time_option=start_time_option, 1975 end_time_option=end_time_option, 1976 partition_field_start=cursor_model.partition_field_start, 1977 partition_field_end=cursor_model.partition_field_end, 1978 config=config, 1979 parameters=model.parameters or {}, 1980 ) 1981 elif model.incremental_sync and isinstance( 1982 model.incremental_sync, IncrementingCountCursorModel 1983 ): 1984 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1985 1986 start_time_option = ( 1987 self._create_component_from_model( 1988 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1989 config, 1990 parameters=cursor_model.parameters or {}, 1991 ) 1992 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1993 else None 1994 ) 1995 1996 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1997 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1998 partition_field_start = "start" 1999 2000 request_options_provider = DatetimeBasedRequestOptionsProvider( 2001 start_time_option=start_time_option, 2002 partition_field_start=partition_field_start, 2003 config=config, 2004 parameters=model.parameters or {}, 2005 ) 2006 else: 2007 request_options_provider = None 2008 2009 transformations = [] 2010 if model.transformations: 2011 for transformation_model in model.transformations: 2012 transformations.append( 2013 self._create_component_from_model(model=transformation_model, config=config) 2014 ) 2015 file_uploader = None 2016 if model.file_uploader: 2017 file_uploader = self._create_component_from_model( 2018 model=model.file_uploader, config=config 2019 ) 2020 2021 retriever = self._create_component_from_model( 2022 model=model.retriever, 2023 config=config, 2024 name=model.name, 2025 primary_key=primary_key, 2026 stream_slicer=combined_slicers, 2027 request_options_provider=request_options_provider, 2028 stop_condition_on_cursor=stop_condition_on_cursor, 2029 client_side_incremental_sync=client_side_incremental_sync, 2030 transformations=transformations, 2031 file_uploader=file_uploader, 2032 incremental_sync=model.incremental_sync, 2033 ) 2034 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2035 2036 if model.state_migrations: 2037 state_transformations = [ 2038 self._create_component_from_model(state_migration, config, declarative_stream=model) 2039 for
state_migration in model.state_migrations 2040 ] 2041 else: 2042 state_transformations = [] 2043 2044 schema_loader: Union[ 2045 CompositeSchemaLoader, 2046 DefaultSchemaLoader, 2047 DynamicSchemaLoader, 2048 InlineSchemaLoader, 2049 JsonFileSchemaLoader, 2050 ] 2051 if model.schema_loader and isinstance(model.schema_loader, list): 2052 nested_schema_loaders = [ 2053 self._create_component_from_model(model=nested_schema_loader, config=config) 2054 for nested_schema_loader in model.schema_loader 2055 ] 2056 schema_loader = CompositeSchemaLoader( 2057 schema_loaders=nested_schema_loaders, parameters={} 2058 ) 2059 elif model.schema_loader: 2060 schema_loader = self._create_component_from_model( 2061 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2062 config=config, 2063 ) 2064 else: 2065 options = model.parameters or {} 2066 if "name" not in options: 2067 options["name"] = model.name 2068 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2069 2070 return DeclarativeStream( 2071 name=model.name or "", 2072 primary_key=primary_key, 2073 retriever=retriever, 2074 schema_loader=schema_loader, 2075 stream_cursor_field=cursor_field or "", 2076 state_migrations=state_transformations, 2077 config=config, 2078 parameters=model.parameters or {}, 2079 ) 2080 2081 def _build_stream_slicer_from_partition_router( 2082 self, 2083 model: Union[ 2084 AsyncRetrieverModel, 2085 CustomRetrieverModel, 2086 SimpleRetrieverModel, 2087 ], 2088 config: Config, 2089 stream_name: Optional[str] = None, 2090 ) -> Optional[PartitionRouter]: 2091 if ( 2092 hasattr(model, "partition_router") 2093 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2094 and model.partition_router 2095 ): 2096 stream_slicer_model = model.partition_router 2097 if isinstance(stream_slicer_model, list): 2098 return CartesianProductStreamSlicer( 2099 [ 2100 self._create_component_from_model( 2101 model=slicer, config=config, stream_name=stream_name or "" 2102 ) 2103 for slicer in stream_slicer_model 2104 ], 2105 parameters={}, 2106 ) 2107 else: 2108 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2109 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2110 ) 2111 return None 2112 2113 def _build_incremental_cursor( 2114 self, 2115 model: DeclarativeStreamModel, 2116 stream_slicer: Optional[PartitionRouter], 2117 config: Config, 2118 ) -> Optional[StreamSlicer]: 2119 if model.incremental_sync and stream_slicer: 2120 if model.retriever.type == "AsyncRetriever": 2121 stream_name = model.name or "" 2122 stream_namespace = None 2123 stream_state = self._connector_state_manager.get_stream_state( 2124 stream_name, stream_namespace 2125 ) 2126 state_transformations = ( 2127 [ 2128 self._create_component_from_model( 2129 state_migration, config, declarative_stream=model 2130 ) 2131 for state_migration in model.state_migrations 2132 ] 2133 if model.state_migrations 2134 else [] 2135 ) 2136 2137 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2138 state_manager=self._connector_state_manager, 2139 model_type=DatetimeBasedCursorModel, 2140 component_definition=model.incremental_sync.__dict__, 2141 stream_name=stream_name, 2142 stream_namespace=stream_namespace, 2143 config=config or {}, 2144 stream_state=stream_state, 2145 stream_state_migrations=state_transformations, 2146 partition_router=stream_slicer, 2147 ) 2148 2149 incremental_sync_model = model.incremental_sync 2150 cursor_component = self._create_component_from_model( 2151 model=incremental_sync_model, config=config 2152 ) 2153 is_global_cursor = ( 2154 hasattr(incremental_sync_model, "global_substream_cursor") 2155 and incremental_sync_model.global_substream_cursor 2156 ) 2157 2158 if is_global_cursor: 2159 return GlobalSubstreamCursor( 2160 stream_cursor=cursor_component, partition_router=stream_slicer 2161 ) 2162 return PerPartitionWithGlobalCursor( 2163 cursor_factory=CursorFactory( 2164 lambda: self._create_component_from_model( 2165 model=incremental_sync_model, config=config 2166 ), 2167 ), 2168 partition_router=stream_slicer, 2169 stream_cursor=cursor_component, 2170 ) 2171 elif model.incremental_sync: 2172 if model.retriever.type == "AsyncRetriever": 2173 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2174 model_type=DatetimeBasedCursorModel, 2175 component_definition=model.incremental_sync.__dict__, 2176 stream_name=model.name or "", 2177 stream_namespace=None, 2178 config=config or {}, 2179 stream_state_migrations=model.state_migrations, 2180 ) 2181 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2182 return None 2183 2184 def _build_resumable_cursor( 2185 self, 2186 model: Union[ 2187 AsyncRetrieverModel, 2188 CustomRetrieverModel, 2189 SimpleRetrieverModel, 2190 ], 2191 stream_slicer: Optional[PartitionRouter], 2192 ) -> Optional[StreamSlicer]: 2193 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2194 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2195 return ResumableFullRefreshCursor(parameters={}) 2196 elif stream_slicer: 2197 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2198 return PerPartitionCursor( 2199 cursor_factory=CursorFactory( 2200 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2201 ), 2202 partition_router=stream_slicer, 2203 ) 2204 return None 2205 2206 def _merge_stream_slicers( 2207 self, model: DeclarativeStreamModel, config: Config 2208 ) -> Optional[StreamSlicer]: 2209 retriever_model = model.retriever 2210 2211 stream_slicer = self._build_stream_slicer_from_partition_router( 2212 retriever_model, config, stream_name=model.name 2213 ) 2214 2215 if retriever_model.type == "AsyncRetriever": 2216 is_not_datetime_cursor = ( 2217 model.incremental_sync.type != "DatetimeBasedCursor" 2218 if model.incremental_sync 2219 else None 2220 ) 2221 is_partition_router = ( 2222 
bool(retriever_model.partition_router) if model.incremental_sync else None 2223 ) 2224 2225 if is_not_datetime_cursor: 2226 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2227 # support or unordered slices (for example, when we trigger reports for January and February, the report 2228 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2229 # implementation available in the CDK, we can enable more cursors here. 2230 raise ValueError( 2231 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2232 ) 2233 2234 if is_partition_router and not stream_slicer: 2235 # Note that this development is also done in parallel to the per partition development which once merged 2236 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2237 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2238 2239 if model.incremental_sync: 2240 return self._build_incremental_cursor(model, stream_slicer, config) 2241 2242 return ( 2243 stream_slicer 2244 if self._disable_resumable_full_refresh 2245 else self._build_resumable_cursor(retriever_model, stream_slicer) 2246 ) 2247 2248 def create_default_error_handler( 2249 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2250 ) -> DefaultErrorHandler: 2251 backoff_strategies = [] 2252 if model.backoff_strategies: 2253 for backoff_strategy_model in model.backoff_strategies: 2254 backoff_strategies.append( 2255 self._create_component_from_model(model=backoff_strategy_model, config=config) 2256 ) 2257 2258 response_filters = [] 2259 if model.response_filters: 2260 for response_filter_model in model.response_filters: 2261 response_filters.append( 2262 self._create_component_from_model(model=response_filter_model, config=config) 2263 ) 2264 response_filters.append( 2265 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2266 ) 2267 2268 return DefaultErrorHandler( 2269 backoff_strategies=backoff_strategies, 2270 max_retries=model.max_retries, 2271 response_filters=response_filters, 2272 config=config, 2273 parameters=model.parameters or {}, 2274 ) 2275 2276 def create_default_paginator( 2277 self, 2278 model: DefaultPaginatorModel, 2279 config: Config, 2280 *, 2281 url_base: str, 2282 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2283 decoder: Optional[Decoder] = None, 2284 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2285 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2286 if decoder: 2287 if self._is_supported_decoder_for_pagination(decoder): 2288 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2289 else: 2290 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2291 else: 2292 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2293 page_size_option = ( 2294 self._create_component_from_model(model=model.page_size_option, config=config) 2295 if model.page_size_option 2296 else None 2297 ) 2298 page_token_option = ( 2299 self._create_component_from_model(model=model.page_token_option, config=config) 2300 if model.page_token_option 2301 else None 2302 ) 2303 pagination_strategy = self._create_component_from_model( 2304 model=model.pagination_strategy, 2305 config=config, 2306 decoder=decoder_to_use, 2307 extractor_model=extractor_model, 2308 ) 2309 if cursor_used_for_stop_condition: 2310 
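# A data-feed style stream hands its cursor to the paginator so that, once the records returned by the API
# fall behind the cursor's current state, the stop condition trips and no further pages are requested
# (a sketch of the intent; see StopConditionPaginationStrategyDecorator and CursorStopCondition below).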
pagination_strategy = StopConditionPaginationStrategyDecorator( 2311 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2312 ) 2313 paginator = DefaultPaginator( 2314 decoder=decoder_to_use, 2315 page_size_option=page_size_option, 2316 page_token_option=page_token_option, 2317 pagination_strategy=pagination_strategy, 2318 url_base=url_base, 2319 config=config, 2320 parameters=model.parameters or {}, 2321 ) 2322 if self._limit_pages_fetched_per_slice: 2323 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2324 return paginator 2325 2326 def create_dpath_extractor( 2327 self, 2328 model: DpathExtractorModel, 2329 config: Config, 2330 decoder: Optional[Decoder] = None, 2331 **kwargs: Any, 2332 ) -> DpathExtractor: 2333 if decoder: 2334 decoder_to_use = decoder 2335 else: 2336 decoder_to_use = JsonDecoder(parameters={}) 2337 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2338 return DpathExtractor( 2339 decoder=decoder_to_use, 2340 field_path=model_field_path, 2341 config=config, 2342 parameters=model.parameters or {}, 2343 ) 2344 2345 @staticmethod 2346 def create_response_to_file_extractor( 2347 model: ResponseToFileExtractorModel, 2348 **kwargs: Any, 2349 ) -> ResponseToFileExtractor: 2350 return ResponseToFileExtractor(parameters=model.parameters or {}) 2351 2352 @staticmethod 2353 def create_exponential_backoff_strategy( 2354 model: ExponentialBackoffStrategyModel, config: Config 2355 ) -> ExponentialBackoffStrategy: 2356 return ExponentialBackoffStrategy( 2357 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2358 ) 2359 2360 @staticmethod 2361 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2362 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2363 2364 def create_http_requester( 2365 self, 2366 model: HttpRequesterModel, 2367 config: Config, 2368 decoder: Decoder = JsonDecoder(parameters={}), 2369 query_properties_key: Optional[str] = None, 2370 use_cache: Optional[bool] = None, 2371 *, 2372 name: str, 2373 ) -> HttpRequester: 2374 authenticator = ( 2375 self._create_component_from_model( 2376 model=model.authenticator, 2377 config=config, 2378 url_base=model.url or model.url_base, 2379 name=name, 2380 decoder=decoder, 2381 ) 2382 if model.authenticator 2383 else None 2384 ) 2385 error_handler = ( 2386 self._create_component_from_model(model=model.error_handler, config=config) 2387 if model.error_handler 2388 else DefaultErrorHandler( 2389 backoff_strategies=[], 2390 response_filters=[], 2391 config=config, 2392 parameters=model.parameters or {}, 2393 ) 2394 ) 2395 2396 api_budget = self._api_budget 2397 2398 # Removes QueryProperties components from the interpolated mappings because it has been designed 2399 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2400 # instead of through jinja interpolation 2401 request_parameters: Optional[Union[str, Mapping[str, str]]] 2402 if isinstance(model.request_parameters, Mapping): 2403 request_parameters = self._remove_query_properties(model.request_parameters) 2404 else: 2405 request_parameters = model.request_parameters 2406 2407 request_options_provider = InterpolatedRequestOptionsProvider( 2408 request_body=model.request_body, 2409 request_body_data=model.request_body_data, 2410 request_body_json=model.request_body_json, 2411 request_headers=model.request_headers, 2412 
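# For instance (hypothetical mapping), request_parameters of the form
# {"since": "{{ stream_interval.start_time }}", "fields": {"type": "QueryProperties", ...}} reach this point
# with the QueryProperties entry already stripped by _remove_query_properties above; those property values
# are injected per slice by the SimpleRetriever rather than interpolated here.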
request_parameters=request_parameters, 2413 query_properties_key=query_properties_key, 2414 config=config, 2415 parameters=model.parameters or {}, 2416 ) 2417 2418 assert model.use_cache is not None # for mypy 2419 assert model.http_method is not None # for mypy 2420 2421 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2422 2423 return HttpRequester( 2424 name=name, 2425 url=model.url, 2426 url_base=model.url_base, 2427 path=model.path, 2428 authenticator=authenticator, 2429 error_handler=error_handler, 2430 api_budget=api_budget, 2431 http_method=HttpMethod[model.http_method.value], 2432 request_options_provider=request_options_provider, 2433 config=config, 2434 disable_retries=self._disable_retries, 2435 parameters=model.parameters or {}, 2436 message_repository=self._message_repository, 2437 use_cache=should_use_cache, 2438 decoder=decoder, 2439 stream_response=decoder.is_stream_response() if decoder else False, 2440 ) 2441 2442 @staticmethod 2443 def create_http_response_filter( 2444 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2445 ) -> HttpResponseFilter: 2446 if model.action: 2447 action = ResponseAction(model.action.value) 2448 else: 2449 action = None 2450 2451 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2452 2453 http_codes = ( 2454 set(model.http_codes) if model.http_codes else set() 2455 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2456 2457 return HttpResponseFilter( 2458 action=action, 2459 failure_type=failure_type, 2460 error_message=model.error_message or "", 2461 error_message_contains=model.error_message_contains or "", 2462 http_codes=http_codes, 2463 predicate=model.predicate or "", 2464 config=config, 2465 parameters=model.parameters or {}, 2466 ) 2467 2468 @staticmethod 2469 def create_inline_schema_loader( 2470 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2471 ) -> InlineSchemaLoader: 2472 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2473 2474 def create_complex_field_type( 2475 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2476 ) -> ComplexFieldType: 2477 items = ( 2478 self._create_component_from_model(model=model.items, config=config) 2479 if isinstance(model.items, ComplexFieldTypeModel) 2480 else model.items 2481 ) 2482 2483 return ComplexFieldType(field_type=model.field_type, items=items) 2484 2485 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2486 target_type = ( 2487 self._create_component_from_model(model=model.target_type, config=config) 2488 if isinstance(model.target_type, ComplexFieldTypeModel) 2489 else model.target_type 2490 ) 2491 2492 return TypesMap( 2493 target_type=target_type, 2494 current_type=model.current_type, 2495 condition=model.condition if model.condition is not None else "True", 2496 ) 2497 2498 def create_schema_type_identifier( 2499 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2500 ) -> SchemaTypeIdentifier: 2501 types_mapping = [] 2502 if model.types_mapping: 2503 types_mapping.extend( 2504 [ 2505 self._create_component_from_model(types_map, config=config) 2506 for types_map in model.types_mapping 2507 ] 2508 ) 2509 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2510 [x for x in model.schema_pointer] if model.schema_pointer else [] 2511 ) 2512 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2513 
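# Illustrative example (hypothetical pointers): with schema_pointer ["fields"], key_pointer ["name"] and
# type_pointer ["type"], the DynamicSchemaLoader that consumes this identifier reads the retrieved record's
# "fields" list, takes each entry's "name" as the property name and maps its "type" through types_mapping
# to a JSON schema type.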
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2514 [x for x in model.type_pointer] if model.type_pointer else None 2515 ) 2516 2517 return SchemaTypeIdentifier( 2518 schema_pointer=model_schema_pointer, 2519 key_pointer=model_key_pointer, 2520 type_pointer=model_type_pointer, 2521 types_mapping=types_mapping, 2522 parameters=model.parameters or {}, 2523 ) 2524 2525 def create_dynamic_schema_loader( 2526 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2527 ) -> DynamicSchemaLoader: 2528 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2529 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2530 2531 schema_transformations = [] 2532 if model.schema_transformations: 2533 for transformation_model in model.schema_transformations: 2534 schema_transformations.append( 2535 self._create_component_from_model(model=transformation_model, config=config) 2536 ) 2537 name = "dynamic_properties" 2538 retriever = self._create_component_from_model( 2539 model=model.retriever, 2540 config=config, 2541 name=name, 2542 primary_key=None, 2543 stream_slicer=combined_slicers, 2544 transformations=[], 2545 use_cache=True, 2546 log_formatter=( 2547 lambda response: format_http_message( 2548 response, 2549 f"Schema loader '{name}' request", 2550 f"Request performed in order to extract schema.", 2551 name, 2552 is_auxiliary=True, 2553 ) 2554 ), 2555 ) 2556 schema_type_identifier = self._create_component_from_model( 2557 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2558 ) 2559 schema_filter = ( 2560 self._create_component_from_model( 2561 model.schema_filter, config=config, parameters=model.parameters or {} 2562 ) 2563 if model.schema_filter is not None 2564 else None 2565 ) 2566 2567 return DynamicSchemaLoader( 2568 retriever=retriever, 2569 config=config, 2570 schema_transformations=schema_transformations, 2571 schema_filter=schema_filter, 2572 schema_type_identifier=schema_type_identifier, 2573 parameters=model.parameters or {}, 2574 ) 2575 2576 @staticmethod 2577 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2578 return JsonDecoder(parameters={}) 2579 2580 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2581 return CompositeRawDecoder( 2582 parser=ModelToComponentFactory._get_parser(model, config), 2583 stream_response=False if self._emit_connector_builder_messages else True, 2584 ) 2585 2586 def create_jsonl_decoder( 2587 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2588 ) -> Decoder: 2589 return CompositeRawDecoder( 2590 parser=ModelToComponentFactory._get_parser(model, config), 2591 stream_response=False if self._emit_connector_builder_messages else True, 2592 ) 2593 2594 def create_gzip_decoder( 2595 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2596 ) -> Decoder: 2597 _compressed_response_types = { 2598 "gzip", 2599 "x-gzip", 2600 "gzip, deflate", 2601 "x-gzip, deflate", 2602 "application/zip", 2603 "application/gzip", 2604 "application/x-gzip", 2605 "application/x-zip-compressed", 2606 } 2607 2608 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2609 2610 if self._emit_connector_builder_messages: 2611 # This is very surprising but if the response is not streamed, 2612 # CompositeRawDecoder calls response.content and the requests library 
actually uncompress the data as opposed to response.raw, 2613 # which uses urllib3 directly and does not uncompress the data. 2614 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2615 2616 return CompositeRawDecoder.by_headers( 2617 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2618 stream_response=True, 2619 fallback_parser=gzip_parser.inner_parser, 2620 ) 2621 2622 @staticmethod 2623 def create_incrementing_count_cursor( 2624 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2625 ) -> DatetimeBasedCursor: 2626 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2627 # we still parse models into components. The issue is that there's no runtime implementation of a 2628 # IncrementingCountCursor. 2629 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2630 return DatetimeBasedCursor( 2631 cursor_field=model.cursor_field, 2632 datetime_format="%Y-%m-%d", 2633 start_datetime="2024-12-12", 2634 config=config, 2635 parameters={}, 2636 ) 2637 2638 @staticmethod 2639 def create_iterable_decoder( 2640 model: IterableDecoderModel, config: Config, **kwargs: Any 2641 ) -> IterableDecoder: 2642 return IterableDecoder(parameters={}) 2643 2644 @staticmethod 2645 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2646 return XmlDecoder(parameters={}) 2647 2648 def create_zipfile_decoder( 2649 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2650 ) -> ZipfileDecoder: 2651 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2652 2653 @staticmethod 2654 def _get_parser(model: BaseModel, config: Config) -> Parser: 2655 if isinstance(model, JsonDecoderModel): 2656 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2657 return JsonParser() 2658 elif isinstance(model, JsonlDecoderModel): 2659 return JsonLineParser() 2660 elif isinstance(model, CsvDecoderModel): 2661 return CsvParser( 2662 encoding=model.encoding, 2663 delimiter=model.delimiter, 2664 set_values_to_none=model.set_values_to_none, 2665 ) 2666 elif isinstance(model, GzipDecoderModel): 2667 return GzipParser( 2668 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2669 ) 2670 elif isinstance( 2671 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2672 ): 2673 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2674 2675 raise ValueError(f"Unknown decoder type {model}") 2676 2677 @staticmethod 2678 def create_json_file_schema_loader( 2679 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2680 ) -> JsonFileSchemaLoader: 2681 return JsonFileSchemaLoader( 2682 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2683 ) 2684 2685 @staticmethod 2686 def create_jwt_authenticator( 2687 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2688 ) -> JwtAuthenticator: 2689 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2690 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2691 return JwtAuthenticator( 2692 config=config, 2693 parameters=model.parameters or {}, 2694 algorithm=JwtAlgorithm(model.algorithm.value), 2695 secret_key=model.secret_key, 2696 
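# jwt_headers / jwt_payload fall back above to models carrying only the standard claims (typ defaults to
# "JWT"); their kid/typ/cty and iss/sub/aud fields are flattened into the individual arguments below.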
base64_encode_secret_key=model.base64_encode_secret_key, 2697 token_duration=model.token_duration, 2698 header_prefix=model.header_prefix, 2699 kid=jwt_headers.kid, 2700 typ=jwt_headers.typ, 2701 cty=jwt_headers.cty, 2702 iss=jwt_payload.iss, 2703 sub=jwt_payload.sub, 2704 aud=jwt_payload.aud, 2705 additional_jwt_headers=model.additional_jwt_headers, 2706 additional_jwt_payload=model.additional_jwt_payload, 2707 ) 2708 2709 def create_list_partition_router( 2710 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2711 ) -> ListPartitionRouter: 2712 request_option = ( 2713 self._create_component_from_model(model.request_option, config) 2714 if model.request_option 2715 else None 2716 ) 2717 return ListPartitionRouter( 2718 cursor_field=model.cursor_field, 2719 request_option=request_option, 2720 values=model.values, 2721 config=config, 2722 parameters=model.parameters or {}, 2723 ) 2724 2725 @staticmethod 2726 def create_min_max_datetime( 2727 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2728 ) -> MinMaxDatetime: 2729 return MinMaxDatetime( 2730 datetime=model.datetime, 2731 datetime_format=model.datetime_format or "", 2732 max_datetime=model.max_datetime or "", 2733 min_datetime=model.min_datetime or "", 2734 parameters=model.parameters or {}, 2735 ) 2736 2737 @staticmethod 2738 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2739 return NoAuth(parameters=model.parameters or {}) 2740 2741 @staticmethod 2742 def create_no_pagination( 2743 model: NoPaginationModel, config: Config, **kwargs: Any 2744 ) -> NoPagination: 2745 return NoPagination(parameters={}) 2746 2747 def create_oauth_authenticator( 2748 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2749 ) -> DeclarativeOauth2Authenticator: 2750 profile_assertion = ( 2751 self._create_component_from_model(model.profile_assertion, config=config) 2752 if model.profile_assertion 2753 else None 2754 ) 2755 2756 if model.refresh_token_updater: 2757 # ignore type error because fixing it would have a lot of dependencies, revisit later 2758 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2759 config, 2760 InterpolatedString.create( 2761 model.token_refresh_endpoint, # type: ignore 2762 parameters=model.parameters or {}, 2763 ).eval(config), 2764 access_token_name=InterpolatedString.create( 2765 model.access_token_name or "access_token", parameters=model.parameters or {} 2766 ).eval(config), 2767 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2768 expires_in_name=InterpolatedString.create( 2769 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2770 ).eval(config), 2771 client_id_name=InterpolatedString.create( 2772 model.client_id_name or "client_id", parameters=model.parameters or {} 2773 ).eval(config), 2774 client_id=InterpolatedString.create( 2775 model.client_id, parameters=model.parameters or {} 2776 ).eval(config) 2777 if model.client_id 2778 else model.client_id, 2779 client_secret_name=InterpolatedString.create( 2780 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2781 ).eval(config), 2782 client_secret=InterpolatedString.create( 2783 model.client_secret, parameters=model.parameters or {} 2784 ).eval(config) 2785 if model.client_secret 2786 else model.client_secret, 2787 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2788 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2789 
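# The *_config_path arguments point at the keys inside the connector config where the single-use
# refresh-token flow persists rotated credentials (for example ["credentials", "refresh_token"]); the values
# here come straight from the refresh_token_updater block of the manifest.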
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2790 grant_type_name=InterpolatedString.create( 2791 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2792 ).eval(config), 2793 grant_type=InterpolatedString.create( 2794 model.grant_type or "refresh_token", parameters=model.parameters or {} 2795 ).eval(config), 2796 refresh_request_body=InterpolatedMapping( 2797 model.refresh_request_body or {}, parameters=model.parameters or {} 2798 ).eval(config), 2799 refresh_request_headers=InterpolatedMapping( 2800 model.refresh_request_headers or {}, parameters=model.parameters or {} 2801 ).eval(config), 2802 scopes=model.scopes, 2803 token_expiry_date_format=model.token_expiry_date_format, 2804 message_repository=self._message_repository, 2805 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2806 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2807 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2808 ) 2809 # ignore type error because fixing it would have a lot of dependencies, revisit later 2810 return DeclarativeOauth2Authenticator( # type: ignore 2811 access_token_name=model.access_token_name or "access_token", 2812 access_token_value=model.access_token_value, 2813 client_id_name=model.client_id_name or "client_id", 2814 client_id=model.client_id, 2815 client_secret_name=model.client_secret_name or "client_secret", 2816 client_secret=model.client_secret, 2817 expires_in_name=model.expires_in_name or "expires_in", 2818 grant_type_name=model.grant_type_name or "grant_type", 2819 grant_type=model.grant_type or "refresh_token", 2820 refresh_request_body=model.refresh_request_body, 2821 refresh_request_headers=model.refresh_request_headers, 2822 refresh_token_name=model.refresh_token_name or "refresh_token", 2823 refresh_token=model.refresh_token, 2824 scopes=model.scopes, 2825 token_expiry_date=model.token_expiry_date, 2826 token_expiry_date_format=model.token_expiry_date_format, 2827 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2828 token_refresh_endpoint=model.token_refresh_endpoint, 2829 config=config, 2830 parameters=model.parameters or {}, 2831 message_repository=self._message_repository, 2832 profile_assertion=profile_assertion, 2833 use_profile_assertion=model.use_profile_assertion, 2834 ) 2835 2836 def create_offset_increment( 2837 self, 2838 model: OffsetIncrementModel, 2839 config: Config, 2840 decoder: Decoder, 2841 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2842 **kwargs: Any, 2843 ) -> OffsetIncrement: 2844 if isinstance(decoder, PaginationDecoderDecorator): 2845 inner_decoder = decoder.decoder 2846 else: 2847 inner_decoder = decoder 2848 decoder = PaginationDecoderDecorator(decoder=decoder) 2849 2850 if self._is_supported_decoder_for_pagination(inner_decoder): 2851 decoder_to_use = decoder 2852 else: 2853 raise ValueError( 2854 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2855 ) 2856 2857 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2858 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2859 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2860 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2861 # When we have more time to investigate we can look into reusing the same component. 2862 extractor = ( 2863 self._create_component_from_model( 2864 model=extractor_model, config=config, decoder=decoder_to_use 2865 ) 2866 if extractor_model 2867 else None 2868 ) 2869 2870 return OffsetIncrement( 2871 page_size=model.page_size, 2872 config=config, 2873 decoder=decoder_to_use, 2874 extractor=extractor, 2875 inject_on_first_request=model.inject_on_first_request or False, 2876 parameters=model.parameters or {}, 2877 ) 2878 2879 @staticmethod 2880 def create_page_increment( 2881 model: PageIncrementModel, config: Config, **kwargs: Any 2882 ) -> PageIncrement: 2883 return PageIncrement( 2884 page_size=model.page_size, 2885 config=config, 2886 start_from_page=model.start_from_page or 0, 2887 inject_on_first_request=model.inject_on_first_request or False, 2888 parameters=model.parameters or {}, 2889 ) 2890 2891 def create_parent_stream_config( 2892 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2893 ) -> ParentStreamConfig: 2894 declarative_stream = self._create_component_from_model( 2895 model.stream, config=config, **kwargs 2896 ) 2897 request_option = ( 2898 self._create_component_from_model(model.request_option, config=config) 2899 if model.request_option 2900 else None 2901 ) 2902 2903 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2904 raise ValueError( 2905 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
            )

        model_lazy_read_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else []
        )

        return ParentStreamConfig(
            parent_key=model.parent_key,
            request_option=request_option,
            stream=declarative_stream,
            partition_field=model.partition_field,
            config=config,
            incremental_dependency=model.incremental_dependency or False,
            parameters=model.parameters or {},
            extra_fields=model.extra_fields,
            lazy_read_pointer=model_lazy_read_pointer,
        )

    def create_properties_from_endpoint(
        self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
    ) -> PropertiesFromEndpoint:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name="dynamic_properties",
            primary_key=None,
            stream_slicer=None,
            transformations=[],
            use_cache=True,  # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ between slices
        )
        return PropertiesFromEndpoint(
            property_field_path=model.property_field_path,
            retriever=retriever,
            config=config,
            parameters=model.parameters or {},
        )

    def create_property_chunking(
        self, model: PropertyChunkingModel, config: Config, **kwargs: Any
    ) -> PropertyChunking:
        record_merge_strategy = (
            self._create_component_from_model(
                model=model.record_merge_strategy, config=config, **kwargs
            )
            if model.record_merge_strategy
            else None
        )

        property_limit_type: PropertyLimitType
        match model.property_limit_type:
            case PropertyLimitTypeModel.property_count:
                property_limit_type = PropertyLimitType.property_count
            case PropertyLimitTypeModel.characters:
                property_limit_type = PropertyLimitType.characters
            case _:
                # the local property_limit_type is never assigned on this branch, so report the model value
                raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")

        return PropertyChunking(
            property_limit_type=property_limit_type,
            property_limit=model.property_limit,
            record_merge_strategy=record_merge_strategy,
            config=config,
            parameters=model.parameters or {},
        )

    def create_query_properties(
        self, model: QueryPropertiesModel, config: Config, **kwargs: Any
    ) -> QueryProperties:
        if isinstance(model.property_list, list):
            property_list = model.property_list
        else:
            property_list = self._create_component_from_model(
                model=model.property_list, config=config, **kwargs
            )

        property_chunking = (
            self._create_component_from_model(
                model=model.property_chunking, config=config, **kwargs
            )
            if model.property_chunking
            else None
        )

        return QueryProperties(
            property_list=property_list,
            always_include_properties=model.always_include_properties,
            property_chunking=property_chunking,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_record_filter(
        model: RecordFilterModel, config: Config, **kwargs: Any
    ) -> RecordFilter:
        return RecordFilter(
            condition=model.condition or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
        return RequestPath(parameters={})

    @staticmethod
    def
create_request_option( 3011 model: RequestOptionModel, config: Config, **kwargs: Any 3012 ) -> RequestOption: 3013 inject_into = RequestOptionType(model.inject_into.value) 3014 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3015 [ 3016 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3017 for segment in model.field_path 3018 ] 3019 if model.field_path 3020 else None 3021 ) 3022 field_name = ( 3023 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3024 if model.field_name 3025 else None 3026 ) 3027 return RequestOption( 3028 field_name=field_name, 3029 field_path=field_path, 3030 inject_into=inject_into, 3031 parameters=kwargs.get("parameters", {}), 3032 ) 3033 3034 def create_record_selector( 3035 self, 3036 model: RecordSelectorModel, 3037 config: Config, 3038 *, 3039 name: str, 3040 transformations: List[RecordTransformation] | None = None, 3041 decoder: Decoder | None = None, 3042 client_side_incremental_sync: Dict[str, Any] | None = None, 3043 file_uploader: Optional[DefaultFileUploader] = None, 3044 **kwargs: Any, 3045 ) -> RecordSelector: 3046 extractor = self._create_component_from_model( 3047 model=model.extractor, decoder=decoder, config=config 3048 ) 3049 record_filter = ( 3050 self._create_component_from_model(model.record_filter, config=config) 3051 if model.record_filter 3052 else None 3053 ) 3054 3055 transform_before_filtering = ( 3056 False if model.transform_before_filtering is None else model.transform_before_filtering 3057 ) 3058 if client_side_incremental_sync: 3059 record_filter = ClientSideIncrementalRecordFilterDecorator( 3060 config=config, 3061 parameters=model.parameters, 3062 condition=model.record_filter.condition 3063 if (model.record_filter and hasattr(model.record_filter, "condition")) 3064 else None, 3065 **client_side_incremental_sync, 3066 ) 3067 transform_before_filtering = ( 3068 True 3069 if model.transform_before_filtering is None 3070 else model.transform_before_filtering 3071 ) 3072 3073 if model.schema_normalization is None: 3074 # default to no schema normalization if not set 3075 model.schema_normalization = SchemaNormalizationModel.None_ 3076 3077 schema_normalization = ( 3078 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3079 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3080 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3081 ) 3082 3083 return RecordSelector( 3084 extractor=extractor, 3085 name=name, 3086 config=config, 3087 record_filter=record_filter, 3088 transformations=transformations or [], 3089 file_uploader=file_uploader, 3090 schema_normalization=schema_normalization, 3091 parameters=model.parameters or {}, 3092 transform_before_filtering=transform_before_filtering, 3093 ) 3094 3095 @staticmethod 3096 def create_remove_fields( 3097 model: RemoveFieldsModel, config: Config, **kwargs: Any 3098 ) -> RemoveFields: 3099 return RemoveFields( 3100 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3101 ) 3102 3103 def create_selective_authenticator( 3104 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3105 ) -> DeclarativeAuthenticator: 3106 authenticators = { 3107 name: self._create_component_from_model(model=auth, config=config) 3108 for name, auth in model.authenticators.items() 3109 } 3110 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3111 return SelectiveAuthenticator( # type: ignore[abstract] 3112 config=config, 3113 authenticators=authenticators, 3114 authenticator_selection_path=model.authenticator_selection_path, 3115 **kwargs, 3116 ) 3117 3118 @staticmethod 3119 def create_legacy_session_token_authenticator( 3120 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3121 ) -> LegacySessionTokenAuthenticator: 3122 return LegacySessionTokenAuthenticator( 3123 api_url=url_base, 3124 header=model.header, 3125 login_url=model.login_url, 3126 password=model.password or "", 3127 session_token=model.session_token or "", 3128 session_token_response_key=model.session_token_response_key or "", 3129 username=model.username or "", 3130 validate_session_url=model.validate_session_url, 3131 config=config, 3132 parameters=model.parameters or {}, 3133 ) 3134 3135 def create_simple_retriever( 3136 self, 3137 model: SimpleRetrieverModel, 3138 config: Config, 3139 *, 3140 name: str, 3141 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3142 stream_slicer: Optional[StreamSlicer], 3143 request_options_provider: Optional[RequestOptionsProvider] = None, 3144 stop_condition_on_cursor: bool = False, 3145 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3146 transformations: List[RecordTransformation], 3147 file_uploader: Optional[DefaultFileUploader] = None, 3148 incremental_sync: Optional[ 3149 Union[ 3150 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3151 ] 3152 ] = None, 3153 use_cache: Optional[bool] = None, 3154 log_formatter: Optional[Callable[[Response], Any]] = None, 3155 **kwargs: Any, 3156 ) -> SimpleRetriever: 3157 def _get_url() -> str: 3158 """ 3159 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3160 This is needed because the URL is not set until the requester is created. 
3161 """ 3162 3163 _url: str = ( 3164 model.requester.url 3165 if hasattr(model.requester, "url") and model.requester.url is not None 3166 else requester.get_url() 3167 ) 3168 _url_base: str = ( 3169 model.requester.url_base 3170 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3171 else requester.get_url_base() 3172 ) 3173 3174 return _url or _url_base 3175 3176 decoder = ( 3177 self._create_component_from_model(model=model.decoder, config=config) 3178 if model.decoder 3179 else JsonDecoder(parameters={}) 3180 ) 3181 record_selector = self._create_component_from_model( 3182 model=model.record_selector, 3183 name=name, 3184 config=config, 3185 decoder=decoder, 3186 transformations=transformations, 3187 client_side_incremental_sync=client_side_incremental_sync, 3188 file_uploader=file_uploader, 3189 ) 3190 3191 query_properties: Optional[QueryProperties] = None 3192 query_properties_key: Optional[str] = None 3193 if self._query_properties_in_request_parameters(model.requester): 3194 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3195 # places instead of default to request_parameters which isn't clearly documented 3196 if ( 3197 hasattr(model.requester, "fetch_properties_from_endpoint") 3198 and model.requester.fetch_properties_from_endpoint 3199 ): 3200 raise ValueError( 3201 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3202 ) 3203 3204 query_properties_definitions = [] 3205 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3206 if isinstance(request_parameter, QueryPropertiesModel): 3207 query_properties_key = key 3208 query_properties_definitions.append(request_parameter) 3209 3210 if len(query_properties_definitions) > 1: 3211 raise ValueError( 3212 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3213 ) 3214 3215 if len(query_properties_definitions) == 1: 3216 query_properties = self._create_component_from_model( 3217 model=query_properties_definitions[0], config=config 3218 ) 3219 elif ( 3220 hasattr(model.requester, "fetch_properties_from_endpoint") 3221 and model.requester.fetch_properties_from_endpoint 3222 ): 3223 query_properties_definition = QueryPropertiesModel( 3224 type="QueryProperties", 3225 property_list=model.requester.fetch_properties_from_endpoint, 3226 always_include_properties=None, 3227 property_chunking=None, 3228 ) # type: ignore # $parameters has a default value 3229 3230 query_properties = self.create_query_properties( 3231 model=query_properties_definition, 3232 config=config, 3233 ) 3234 3235 requester = self._create_component_from_model( 3236 model=model.requester, 3237 decoder=decoder, 3238 name=name, 3239 query_properties_key=query_properties_key, 3240 use_cache=use_cache, 3241 config=config, 3242 ) 3243 3244 # Define cursor only if per partition or common incremental support is needed 3245 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3246 3247 if ( 3248 not isinstance(stream_slicer, DatetimeBasedCursor) 3249 or type(stream_slicer) is not DatetimeBasedCursor 3250 ): 3251 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3252 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3253 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3254 # request_options_provider 3255 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3256 elif not request_options_provider: 3257 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3258 3259 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3260 if self._should_limit_slices_fetched(): 3261 stream_slicer = cast( 3262 StreamSlicer, 3263 StreamSlicerTestReadDecorator( 3264 wrapped_slicer=stream_slicer, 3265 maximum_number_of_slices=self._limit_slices_fetched or 5, 3266 ), 3267 ) 3268 3269 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3270 paginator = ( 3271 self._create_component_from_model( 3272 model=model.paginator, 3273 config=config, 3274 url_base=_get_url(), 3275 extractor_model=model.record_selector.extractor, 3276 decoder=decoder, 3277 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3278 ) 3279 if model.paginator 3280 else NoPagination(parameters={}) 3281 ) 3282 3283 ignore_stream_slicer_parameters_on_paginated_requests = ( 3284 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3285 ) 3286 3287 if ( 3288 model.partition_router 3289 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3290 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3291 and any( 3292 parent_stream_config.lazy_read_pointer 3293 for parent_stream_config in model.partition_router.parent_stream_configs 3294 ) 3295 ): 3296 if incremental_sync: 3297 if incremental_sync.type != "DatetimeBasedCursor": 3298 raise ValueError( 3299 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3300 ) 3301 3302 elif incremental_sync.step or incremental_sync.cursor_granularity: 3303 raise ValueError( 3304 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3305 ) 3306 3307 if model.decoder and model.decoder.type != "JsonDecoder": 3308 raise ValueError( 3309 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 
3310 ) 3311 3312 return LazySimpleRetriever( 3313 name=name, 3314 paginator=paginator, 3315 primary_key=primary_key, 3316 requester=requester, 3317 record_selector=record_selector, 3318 stream_slicer=stream_slicer, 3319 request_option_provider=request_options_provider, 3320 cursor=cursor, 3321 config=config, 3322 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3323 parameters=model.parameters or {}, 3324 ) 3325 3326 return SimpleRetriever( 3327 name=name, 3328 paginator=paginator, 3329 primary_key=primary_key, 3330 requester=requester, 3331 record_selector=record_selector, 3332 stream_slicer=stream_slicer, 3333 request_option_provider=request_options_provider, 3334 cursor=cursor, 3335 config=config, 3336 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3337 additional_query_properties=query_properties, 3338 log_formatter=self._get_log_formatter(log_formatter, name), 3339 parameters=model.parameters or {}, 3340 ) 3341 3342 def _get_log_formatter( 3343 self, log_formatter: Callable[[Response], Any] | None, name: str 3344 ) -> Callable[[Response], Any] | None: 3345 if self._should_limit_slices_fetched(): 3346 return ( 3347 ( 3348 lambda response: format_http_message( 3349 response, 3350 f"Stream '{name}' request", 3351 f"Request performed in order to extract records for stream '{name}'", 3352 name, 3353 ) 3354 ) 3355 if not log_formatter 3356 else log_formatter 3357 ) 3358 return None 3359 3360 def _should_limit_slices_fetched(self) -> bool: 3361 """ 3362 Returns True if the number of slices fetched should be limited, False otherwise. 3363 This is used to limit the number of slices fetched during tests. 3364 """ 3365 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3366 3367 @staticmethod 3368 def _query_properties_in_request_parameters( 3369 requester: Union[HttpRequesterModel, CustomRequesterModel], 3370 ) -> bool: 3371 if not hasattr(requester, "request_parameters"): 3372 return False 3373 request_parameters = requester.request_parameters 3374 if request_parameters and isinstance(request_parameters, Mapping): 3375 for request_parameter in request_parameters.values(): 3376 if isinstance(request_parameter, QueryPropertiesModel): 3377 return True 3378 return False 3379 3380 @staticmethod 3381 def _remove_query_properties( 3382 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3383 ) -> Mapping[str, str]: 3384 return { 3385 parameter_field: request_parameter 3386 for parameter_field, request_parameter in request_parameters.items() 3387 if not isinstance(request_parameter, QueryPropertiesModel) 3388 } 3389 3390 def create_state_delegating_stream( 3391 self, 3392 model: StateDelegatingStreamModel, 3393 config: Config, 3394 has_parent_state: Optional[bool] = None, 3395 **kwargs: Any, 3396 ) -> DeclarativeStream: 3397 if ( 3398 model.full_refresh_stream.name != model.name 3399 or model.name != model.incremental_stream.name 3400 ): 3401 raise ValueError( 3402 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3403 ) 3404 3405 stream_model = ( 3406 model.incremental_stream 3407 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3408 else model.full_refresh_stream 3409 ) 3410 3411 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3412 3413 def _create_async_job_status_mapping( 3414 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3415 ) -> Mapping[str, AsyncJobStatus]: 3416 api_status_to_cdk_status = {} 3417 for cdk_status, api_statuses in model.dict().items(): 3418 if cdk_status == "type": 3419 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3420 continue 3421 3422 for status in api_statuses: 3423 if status in api_status_to_cdk_status: 3424 raise ValueError( 3425 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3426 ) 3427 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3428 return api_status_to_cdk_status 3429 3430 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3431 match status: 3432 case "running": 3433 return AsyncJobStatus.RUNNING 3434 case "completed": 3435 return AsyncJobStatus.COMPLETED 3436 case "failed": 3437 return AsyncJobStatus.FAILED 3438 case "timeout": 3439 return AsyncJobStatus.TIMED_OUT 3440 case _: 3441 raise ValueError(f"Unsupported CDK status {status}") 3442 3443 def create_async_retriever( 3444 self, 3445 model: AsyncRetrieverModel, 3446 config: Config, 3447 *, 3448 name: str, 3449 primary_key: Optional[ 3450 Union[str, List[str], List[List[str]]] 3451 ], # this seems to be needed to match create_simple_retriever 3452 stream_slicer: Optional[StreamSlicer], 3453 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3454 transformations: List[RecordTransformation], 3455 **kwargs: Any, 3456 ) -> AsyncRetriever: 3457 def _get_download_retriever() -> SimpleRetriever: 3458 # We create a record selector for the download retriever 3459 # with no schema normalization and no transformations, neither record filter 3460 # as all this occurs in the record_selector of the AsyncRetriever 3461 record_selector = RecordSelector( 3462 extractor=download_extractor, 3463 name=name, 3464 record_filter=None, 3465 transformations=[], 3466 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3467 config=config, 3468 parameters={}, 3469 ) 3470 paginator = ( 3471 self._create_component_from_model( 3472 model=model.download_paginator, 3473 decoder=decoder, 3474 config=config, 3475 url_base="", 3476 ) 3477 if model.download_paginator 3478 else NoPagination(parameters={}) 3479 ) 3480 3481 return SimpleRetriever( 3482 requester=download_requester, 3483 record_selector=record_selector, 3484 primary_key=None, 3485 name=job_download_components_name, 3486 paginator=paginator, 3487 config=config, 3488 parameters={}, 3489 ) 3490 3491 def _get_job_timeout() -> datetime.timedelta: 3492 user_defined_timeout: Optional[int] = ( 3493 int( 3494 InterpolatedString.create( 3495 str(model.polling_job_timeout), 3496 parameters={}, 3497 ).eval(config) 3498 ) 3499 if model.polling_job_timeout 3500 else None 3501 ) 3502 3503 # check for user defined timeout during the test read or 15 minutes 3504 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3505 # default value for non-connector builder is 60 minutes. 
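            # For example, a manifest value of `polling_job_timeout: 30` (hypothetical) would yield a
            # 30-minute budget for both connector builder test reads and regular syncs; with no value set,
            # test reads fall back to 15 minutes and regular syncs to 60 minutes.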
3506 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3507 3508 return ( 3509 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3510 ) 3511 3512 decoder = ( 3513 self._create_component_from_model(model=model.decoder, config=config) 3514 if model.decoder 3515 else JsonDecoder(parameters={}) 3516 ) 3517 record_selector = self._create_component_from_model( 3518 model=model.record_selector, 3519 config=config, 3520 decoder=decoder, 3521 name=name, 3522 transformations=transformations, 3523 client_side_incremental_sync=client_side_incremental_sync, 3524 ) 3525 3526 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3527 if self._should_limit_slices_fetched(): 3528 stream_slicer = cast( 3529 StreamSlicer, 3530 StreamSlicerTestReadDecorator( 3531 wrapped_slicer=stream_slicer, 3532 maximum_number_of_slices=self._limit_slices_fetched or 5, 3533 ), 3534 ) 3535 3536 creation_requester = self._create_component_from_model( 3537 model=model.creation_requester, 3538 decoder=decoder, 3539 config=config, 3540 name=f"job creation - {name}", 3541 ) 3542 polling_requester = self._create_component_from_model( 3543 model=model.polling_requester, 3544 decoder=decoder, 3545 config=config, 3546 name=f"job polling - {name}", 3547 ) 3548 job_download_components_name = f"job download - {name}" 3549 download_decoder = ( 3550 self._create_component_from_model(model=model.download_decoder, config=config) 3551 if model.download_decoder 3552 else JsonDecoder(parameters={}) 3553 ) 3554 download_extractor = ( 3555 self._create_component_from_model( 3556 model=model.download_extractor, 3557 config=config, 3558 decoder=download_decoder, 3559 parameters=model.parameters, 3560 ) 3561 if model.download_extractor 3562 else DpathExtractor( 3563 [], 3564 config=config, 3565 decoder=download_decoder, 3566 parameters=model.parameters or {}, 3567 ) 3568 ) 3569 download_requester = self._create_component_from_model( 3570 model=model.download_requester, 3571 decoder=download_decoder, 3572 config=config, 3573 name=job_download_components_name, 3574 ) 3575 download_retriever = _get_download_retriever() 3576 abort_requester = ( 3577 self._create_component_from_model( 3578 model=model.abort_requester, 3579 decoder=decoder, 3580 config=config, 3581 name=f"job abort - {name}", 3582 ) 3583 if model.abort_requester 3584 else None 3585 ) 3586 delete_requester = ( 3587 self._create_component_from_model( 3588 model=model.delete_requester, 3589 decoder=decoder, 3590 config=config, 3591 name=f"job delete - {name}", 3592 ) 3593 if model.delete_requester 3594 else None 3595 ) 3596 download_target_requester = ( 3597 self._create_component_from_model( 3598 model=model.download_target_requester, 3599 decoder=decoder, 3600 config=config, 3601 name=f"job extract_url - {name}", 3602 ) 3603 if model.download_target_requester 3604 else None 3605 ) 3606 status_extractor = self._create_component_from_model( 3607 model=model.status_extractor, decoder=decoder, config=config, name=name 3608 ) 3609 download_target_extractor = self._create_component_from_model( 3610 model=model.download_target_extractor, 3611 decoder=decoder, 3612 config=config, 3613 name=name, 3614 ) 3615 3616 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3617 creation_requester=creation_requester, 3618 polling_requester=polling_requester, 3619 download_retriever=download_retriever, 3620 download_target_requester=download_target_requester, 3621 abort_requester=abort_requester, 3622 
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder` use-case.
                # `None` means the default of 3 retry attempts is used under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )

    def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
        config_migrations = [
            self._create_component_from_model(migration, config)
            for migration in (
                model.config_normalization_rules.config_migrations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.config_migrations
                )
                else []
            )
        ]
        config_transformations = [
            self._create_component_from_model(transformation, config)
            for transformation in (
                model.config_normalization_rules.transformations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.transformations
                )
                else []
            )
        ]
        config_validations = [
            self._create_component_from_model(validation, config)
            for validation in (
                model.config_normalization_rules.validations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.validations
                )
                else []
            )
        ]

        return Spec(
            connection_specification=model.connection_specification,
            documentation_url=model.documentation_url,
            advanced_auth=model.advanced_auth,
            parameters={},
            config_migrations=config_migrations,
            config_transformations=config_transformations,
            config_validations=config_validations,
        )

    def create_substream_partition_router(
        self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any
    ) -> SubstreamPartitionRouter:
        parent_stream_configs = []
        if model.parent_stream_configs:
            parent_stream_configs.extend(
                [
                    self._create_message_repository_substream_wrapper(
                        model=parent_stream_config, config=config, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

    def _create_message_repository_substream_wrapper(
        self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
    ) -> Any:
        substream_factory = ModelToComponentFactory(
            limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
            limit_slices_fetched=self._limit_slices_fetched,
            emit_connector_builder_messages=self._emit_connector_builder_messages,
disable_retries=self._disable_retries, 3726 disable_cache=self._disable_cache, 3727 message_repository=LogAppenderMessageRepositoryDecorator( 3728 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3729 self._message_repository, 3730 self._evaluate_log_level(self._emit_connector_builder_messages), 3731 ), 3732 ) 3733 3734 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3735 has_parent_state = bool( 3736 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3737 if model.incremental_dependency 3738 else False 3739 ) 3740 return substream_factory._create_component_from_model( 3741 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3742 ) 3743 3744 @staticmethod 3745 def create_wait_time_from_header( 3746 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3747 ) -> WaitTimeFromHeaderBackoffStrategy: 3748 return WaitTimeFromHeaderBackoffStrategy( 3749 header=model.header, 3750 parameters=model.parameters or {}, 3751 config=config, 3752 regex=model.regex, 3753 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3754 if model.max_waiting_time_in_seconds is not None 3755 else None, 3756 ) 3757 3758 @staticmethod 3759 def create_wait_until_time_from_header( 3760 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3761 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3762 return WaitUntilTimeFromHeaderBackoffStrategy( 3763 header=model.header, 3764 parameters=model.parameters or {}, 3765 config=config, 3766 min_wait=model.min_wait, 3767 regex=model.regex, 3768 ) 3769 3770 def get_message_repository(self) -> MessageRepository: 3771 return self._message_repository 3772 3773 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3774 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3775 3776 @staticmethod 3777 def create_components_mapping_definition( 3778 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3779 ) -> ComponentMappingDefinition: 3780 interpolated_value = InterpolatedString.create( 3781 model.value, parameters=model.parameters or {} 3782 ) 3783 field_path = [ 3784 InterpolatedString.create(path, parameters=model.parameters or {}) 3785 for path in model.field_path 3786 ] 3787 return ComponentMappingDefinition( 3788 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3789 value=interpolated_value, 3790 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3791 create_or_update=model.create_or_update, 3792 parameters=model.parameters or {}, 3793 ) 3794 3795 def create_http_components_resolver( 3796 self, model: HttpComponentsResolverModel, config: Config 3797 ) -> Any: 3798 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3799 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3800 3801 retriever = self._create_component_from_model( 3802 model=model.retriever, 3803 config=config, 3804 name="", 3805 primary_key=None, 3806 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3807 transformations=[], 3808 ) 3809 3810 components_mapping = [ 3811 self._create_component_from_model( 3812 model=components_mapping_definition_model, 3813 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3814 components_mapping_definition_model.value_type 3815 ), 3816 config=config, 3817 ) 3818 for 
components_mapping_definition_model in model.components_mapping 3819 ] 3820 3821 return HttpComponentsResolver( 3822 retriever=retriever, 3823 config=config, 3824 components_mapping=components_mapping, 3825 parameters=model.parameters or {}, 3826 ) 3827 3828 @staticmethod 3829 def create_stream_config( 3830 model: StreamConfigModel, config: Config, **kwargs: Any 3831 ) -> StreamConfig: 3832 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3833 [x for x in model.configs_pointer] if model.configs_pointer else [] 3834 ) 3835 3836 return StreamConfig( 3837 configs_pointer=model_configs_pointer, 3838 default_values=model.default_values, 3839 parameters=model.parameters or {}, 3840 ) 3841 3842 def create_config_components_resolver( 3843 self, model: ConfigComponentsResolverModel, config: Config 3844 ) -> Any: 3845 model_stream_configs = ( 3846 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3847 ) 3848 3849 stream_configs = [ 3850 self._create_component_from_model( 3851 stream_config, config=config, parameters=model.parameters or {} 3852 ) 3853 for stream_config in model_stream_configs 3854 ] 3855 3856 components_mapping = [ 3857 self._create_component_from_model( 3858 model=components_mapping_definition_model, 3859 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3860 components_mapping_definition_model.value_type 3861 ), 3862 config=config, 3863 ) 3864 for components_mapping_definition_model in model.components_mapping 3865 ] 3866 3867 return ConfigComponentsResolver( 3868 stream_configs=stream_configs, 3869 config=config, 3870 components_mapping=components_mapping, 3871 parameters=model.parameters or {}, 3872 ) 3873 3874 def create_parametrized_components_resolver( 3875 self, model: ParametrizedComponentsResolverModel, config: Config 3876 ) -> ParametrizedComponentsResolver: 3877 stream_parameters = StreamParametersDefinition( 3878 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3879 ) 3880 components_mapping = [ 3881 self._create_component_from_model( 3882 model=components_mapping_definition_model, 3883 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3884 components_mapping_definition_model.value_type 3885 ), 3886 config=config, 3887 ) 3888 for components_mapping_definition_model in model.components_mapping 3889 ] 3890 return ParametrizedComponentsResolver( 3891 stream_parameters=stream_parameters, 3892 config=config, 3893 components_mapping=components_mapping, 3894 parameters=model.parameters or {}, 3895 ) 3896 3897 _UNSUPPORTED_DECODER_ERROR = ( 3898 "Specified decoder of {decoder_type} is not supported for pagination." 3899 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3900 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3901 ) 3902 3903 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3904 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3905 return True 3906 elif isinstance(decoder, CompositeRawDecoder): 3907 return self._is_supported_parser_for_pagination(decoder.parser) 3908 else: 3909 return False 3910 3911 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3912 if isinstance(parser, JsonParser): 3913 return True 3914 elif isinstance(parser, GzipParser): 3915 return isinstance(parser.inner_parser, JsonParser) 3916 else: 3917 return False 3918 3919 def create_http_api_budget( 3920 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3921 ) -> HttpAPIBudget: 3922 policies = [ 3923 self._create_component_from_model(model=policy, config=config) 3924 for policy in model.policies 3925 ] 3926 3927 return HttpAPIBudget( 3928 policies=policies, 3929 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3930 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3931 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3932 ) 3933 3934 def create_fixed_window_call_rate_policy( 3935 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3936 ) -> FixedWindowCallRatePolicy: 3937 matchers = [ 3938 self._create_component_from_model(model=matcher, config=config) 3939 for matcher in model.matchers 3940 ] 3941 3942 # Set the initial reset timestamp to 10 days from now. 3943 # This value will be updated by the first request. 3944 return FixedWindowCallRatePolicy( 3945 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3946 period=parse_duration(model.period), 3947 call_limit=model.call_limit, 3948 matchers=matchers, 3949 ) 3950 3951 def create_file_uploader( 3952 self, model: FileUploaderModel, config: Config, **kwargs: Any 3953 ) -> FileUploader: 3954 name = "File Uploader" 3955 requester = self._create_component_from_model( 3956 model=model.requester, 3957 config=config, 3958 name=name, 3959 **kwargs, 3960 ) 3961 download_target_extractor = self._create_component_from_model( 3962 model=model.download_target_extractor, 3963 config=config, 3964 name=name, 3965 **kwargs, 3966 ) 3967 emit_connector_builder_messages = self._emit_connector_builder_messages 3968 file_uploader = DefaultFileUploader( 3969 requester=requester, 3970 download_target_extractor=download_target_extractor, 3971 config=config, 3972 file_writer=NoopFileWriter() 3973 if emit_connector_builder_messages 3974 else LocalFileSystemFileWriter(), 3975 parameters=model.parameters or {}, 3976 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3977 ) 3978 3979 return ( 3980 ConnectorBuilderFileUploader(file_uploader) 3981 if emit_connector_builder_messages 3982 else file_uploader 3983 ) 3984 3985 def create_moving_window_call_rate_policy( 3986 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3987 ) -> MovingWindowCallRatePolicy: 3988 rates = [ 3989 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3990 ] 3991 matchers = [ 3992 self._create_component_from_model(model=matcher, config=config) 3993 for matcher in model.matchers 3994 ] 3995 return MovingWindowCallRatePolicy( 3996 rates=rates, 3997 matchers=matchers, 3998 ) 3999 4000 def create_unlimited_call_rate_policy( 4001 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4002 ) -> UnlimitedCallRatePolicy: 4003 matchers = [ 4004 
self._create_component_from_model(model=matcher, config=config) 4005 for matcher in model.matchers 4006 ] 4007 4008 return UnlimitedCallRatePolicy( 4009 matchers=matchers, 4010 ) 4011 4012 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4013 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4014 return Rate( 4015 limit=int(interpolated_limit.eval(config=config)), 4016 interval=parse_duration(model.interval), 4017 ) 4018 4019 def create_http_request_matcher( 4020 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4021 ) -> HttpRequestRegexMatcher: 4022 return HttpRequestRegexMatcher( 4023 method=model.method, 4024 url_base=model.url_base, 4025 url_path_pattern=model.url_path_pattern, 4026 params=model.params, 4027 headers=model.headers, 4028 ) 4029 4030 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4031 self._api_budget = self.create_component( 4032 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4033 ) 4034 4035 def create_grouping_partition_router( 4036 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4037 ) -> GroupingPartitionRouter: 4038 underlying_router = self._create_component_from_model( 4039 model=model.underlying_partition_router, config=config 4040 ) 4041 if model.group_size < 1: 4042 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4043 4044 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4045 # because they are specific to individual partitions and cannot be aggregated or handled 4046 # when grouping, potentially leading to incorrect API calls. Any request customization 4047 # should be managed at the stream level through the requester's configuration. 4048 if isinstance(underlying_router, SubstreamPartitionRouter): 4049 if any( 4050 parent_config.request_option 4051 for parent_config in underlying_router.parent_stream_configs 4052 ): 4053 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4054 4055 if isinstance(underlying_router, ListPartitionRouter): 4056 if underlying_router.request_option: 4057 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4058 4059 return GroupingPartitionRouter( 4060 group_size=model.group_size, 4061 underlying_partition_router=underlying_router, 4062 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4063 config=config, 4064 )
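A minimal usage sketch (not part of the module source): given a manifest-style mapping, create_component parses it into the corresponding Pydantic model and dispatches to the matching create_* method. The import paths and the exact fields required by RequestOption below are assumptions and should be checked against the installed CDK version.

# Illustrative only: build a RequestOption runtime component from a manifest-style definition.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RequestOption as RequestOptionModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
request_option = factory.create_component(
    model_type=RequestOptionModel,
    component_definition={
        "type": "RequestOption",             # must match the model class name
        "inject_into": "request_parameter",  # where the value is injected
        "field_name": "page_size",           # hypothetical field name
    },
    config={},
)
# request_option now injects `page_size` as a query parameter at request time.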
625class ModelToComponentFactory: 626 EPOCH_DATETIME_FORMAT = "%s" 627 628 def __init__( 629 self, 630 limit_pages_fetched_per_slice: Optional[int] = None, 631 limit_slices_fetched: Optional[int] = None, 632 emit_connector_builder_messages: bool = False, 633 disable_retries: bool = False, 634 disable_cache: bool = False, 635 disable_resumable_full_refresh: bool = False, 636 message_repository: Optional[MessageRepository] = None, 637 connector_state_manager: Optional[ConnectorStateManager] = None, 638 max_concurrent_async_job_count: Optional[int] = None, 639 ): 640 self._init_mappings() 641 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 642 self._limit_slices_fetched = limit_slices_fetched 643 self._emit_connector_builder_messages = emit_connector_builder_messages 644 self._disable_retries = disable_retries 645 self._disable_cache = disable_cache 646 self._disable_resumable_full_refresh = disable_resumable_full_refresh 647 self._message_repository = message_repository or InMemoryMessageRepository( 648 self._evaluate_log_level(emit_connector_builder_messages) 649 ) 650 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 651 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 652 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 653 # placeholder for deprecation warnings 654 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 655 656 def _init_mappings(self) -> None: 657 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 658 AddedFieldDefinitionModel: self.create_added_field_definition, 659 AddFieldsModel: self.create_add_fields, 660 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 661 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 662 BearerAuthenticatorModel: self.create_bearer_authenticator, 663 CheckStreamModel: self.create_check_stream, 664 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 665 CheckDynamicStreamModel: self.create_check_dynamic_stream, 666 CompositeErrorHandlerModel: self.create_composite_error_handler, 667 ConcurrencyLevelModel: self.create_concurrency_level, 668 ConfigMigrationModel: self.create_config_migration, 669 ConfigAddFieldsModel: self.create_config_add_fields, 670 ConfigRemapFieldModel: self.create_config_remap_field, 671 ConfigRemoveFieldsModel: self.create_config_remove_fields, 672 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 673 CsvDecoderModel: self.create_csv_decoder, 674 CursorPaginationModel: self.create_cursor_pagination, 675 CustomAuthenticatorModel: self.create_custom_component, 676 CustomBackoffStrategyModel: self.create_custom_component, 677 CustomDecoderModel: self.create_custom_component, 678 CustomErrorHandlerModel: self.create_custom_component, 679 CustomIncrementalSyncModel: self.create_custom_component, 680 CustomRecordExtractorModel: self.create_custom_component, 681 CustomRecordFilterModel: self.create_custom_component, 682 CustomRequesterModel: self.create_custom_component, 683 CustomRetrieverModel: self.create_custom_component, 684 CustomSchemaLoader: self.create_custom_component, 685 CustomSchemaNormalizationModel: self.create_custom_component, 686 CustomStateMigration: self.create_custom_component, 687 CustomPaginationStrategyModel: self.create_custom_component, 688 CustomPartitionRouterModel: self.create_custom_component, 689 CustomTransformationModel: self.create_custom_component, 690 
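            # All of the Custom*Model entries resolve through create_custom_component, which imports the
            # user-supplied class referenced by the manifest's class_name and instantiates it with the
            # declared arguments.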
CustomValidationStrategyModel: self.create_custom_component, 691 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 692 DeclarativeStreamModel: self.create_declarative_stream, 693 DefaultErrorHandlerModel: self.create_default_error_handler, 694 DefaultPaginatorModel: self.create_default_paginator, 695 DpathExtractorModel: self.create_dpath_extractor, 696 DpathValidatorModel: self.create_dpath_validator, 697 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 698 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 699 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 700 GroupByKeyMergeStrategyModel: self.create_group_by_key, 701 HttpRequesterModel: self.create_http_requester, 702 HttpResponseFilterModel: self.create_http_response_filter, 703 InlineSchemaLoaderModel: self.create_inline_schema_loader, 704 JsonDecoderModel: self.create_json_decoder, 705 JsonlDecoderModel: self.create_jsonl_decoder, 706 GzipDecoderModel: self.create_gzip_decoder, 707 KeysToLowerModel: self.create_keys_to_lower_transformation, 708 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 709 KeysReplaceModel: self.create_keys_replace_transformation, 710 FlattenFieldsModel: self.create_flatten_fields, 711 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 712 IterableDecoderModel: self.create_iterable_decoder, 713 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 714 XmlDecoderModel: self.create_xml_decoder, 715 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 716 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 717 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 718 TypesMapModel: self.create_types_map, 719 ComplexFieldTypeModel: self.create_complex_field_type, 720 JwtAuthenticatorModel: self.create_jwt_authenticator, 721 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 722 ListPartitionRouterModel: self.create_list_partition_router, 723 MinMaxDatetimeModel: self.create_min_max_datetime, 724 NoAuthModel: self.create_no_auth, 725 NoPaginationModel: self.create_no_pagination, 726 OAuthAuthenticatorModel: self.create_oauth_authenticator, 727 OffsetIncrementModel: self.create_offset_increment, 728 PageIncrementModel: self.create_page_increment, 729 ParentStreamConfigModel: self.create_parent_stream_config, 730 PredicateValidatorModel: self.create_predicate_validator, 731 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 732 PropertyChunkingModel: self.create_property_chunking, 733 QueryPropertiesModel: self.create_query_properties, 734 RecordFilterModel: self.create_record_filter, 735 RecordSelectorModel: self.create_record_selector, 736 RemoveFieldsModel: self.create_remove_fields, 737 RequestPathModel: self.create_request_path, 738 RequestOptionModel: self.create_request_option, 739 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 740 SelectiveAuthenticatorModel: self.create_selective_authenticator, 741 SimpleRetrieverModel: self.create_simple_retriever, 742 StateDelegatingStreamModel: self.create_state_delegating_stream, 743 SpecModel: self.create_spec, 744 SubstreamPartitionRouterModel: self.create_substream_partition_router, 745 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 746 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 747 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 748 AsyncRetrieverModel: 
            self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
821 """ 822 return self._collected_deprecation_logs 823 824 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 825 """ 826 Collects deprecation logs from the given model and appends any new logs to the internal collection. 827 828 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 829 830 Args: 831 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 832 """ 833 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 834 for log in model._deprecation_logs: 835 # avoid duplicates for deprecation logs observed. 836 if log not in self._collected_deprecation_logs: 837 self._collected_deprecation_logs.append(log) 838 839 def create_config_migration( 840 self, model: ConfigMigrationModel, config: Config 841 ) -> ConfigMigration: 842 transformations: List[ConfigTransformation] = [ 843 self._create_component_from_model(transformation, config) 844 for transformation in model.transformations 845 ] 846 847 return ConfigMigration( 848 description=model.description, 849 transformations=transformations, 850 ) 851 852 def create_config_add_fields( 853 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 854 ) -> ConfigAddFields: 855 fields = [self._create_component_from_model(field, config) for field in model.fields] 856 return ConfigAddFields( 857 fields=fields, 858 condition=model.condition or "", 859 ) 860 861 @staticmethod 862 def create_config_remove_fields( 863 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 864 ) -> ConfigRemoveFields: 865 return ConfigRemoveFields( 866 field_pointers=model.field_pointers, 867 condition=model.condition or "", 868 ) 869 870 @staticmethod 871 def create_config_remap_field( 872 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 873 ) -> ConfigRemapField: 874 mapping = cast(Mapping[str, Any], model.map) 875 return ConfigRemapField( 876 map=mapping, 877 field_path=model.field_path, 878 config=config, 879 ) 880 881 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 882 strategy = self._create_component_from_model(model.validation_strategy, config) 883 884 return DpathValidator( 885 field_path=model.field_path, 886 strategy=strategy, 887 ) 888 889 def create_predicate_validator( 890 self, model: PredicateValidatorModel, config: Config 891 ) -> PredicateValidator: 892 strategy = self._create_component_from_model(model.validation_strategy, config) 893 894 return PredicateValidator( 895 value=model.value, 896 strategy=strategy, 897 ) 898 899 @staticmethod 900 def create_validate_adheres_to_schema( 901 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 902 ) -> ValidateAdheresToSchema: 903 base_schema = cast(Mapping[str, Any], model.base_schema) 904 return ValidateAdheresToSchema( 905 schema=base_schema, 906 ) 907 908 @staticmethod 909 def create_added_field_definition( 910 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 911 ) -> AddedFieldDefinition: 912 interpolated_value = InterpolatedString.create( 913 model.value, parameters=model.parameters or {} 914 ) 915 return AddedFieldDefinition( 916 path=model.path, 917 value=interpolated_value, 918 
value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 919 parameters=model.parameters or {}, 920 ) 921 922 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 923 added_field_definitions = [ 924 self._create_component_from_model( 925 model=added_field_definition_model, 926 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 927 added_field_definition_model.value_type 928 ), 929 config=config, 930 ) 931 for added_field_definition_model in model.fields 932 ] 933 return AddFields( 934 fields=added_field_definitions, 935 condition=model.condition or "", 936 parameters=model.parameters or {}, 937 ) 938 939 def create_keys_to_lower_transformation( 940 self, model: KeysToLowerModel, config: Config, **kwargs: Any 941 ) -> KeysToLowerTransformation: 942 return KeysToLowerTransformation() 943 944 def create_keys_to_snake_transformation( 945 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 946 ) -> KeysToSnakeCaseTransformation: 947 return KeysToSnakeCaseTransformation() 948 949 def create_keys_replace_transformation( 950 self, model: KeysReplaceModel, config: Config, **kwargs: Any 951 ) -> KeysReplaceTransformation: 952 return KeysReplaceTransformation( 953 old=model.old, new=model.new, parameters=model.parameters or {} 954 ) 955 956 def create_flatten_fields( 957 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 958 ) -> FlattenFields: 959 return FlattenFields( 960 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 961 ) 962 963 def create_dpath_flatten_fields( 964 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 965 ) -> DpathFlattenFields: 966 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 967 key_transformation = ( 968 KeyTransformation( 969 config=config, 970 prefix=model.key_transformation.prefix, 971 suffix=model.key_transformation.suffix, 972 parameters=model.parameters or {}, 973 ) 974 if model.key_transformation is not None 975 else None 976 ) 977 return DpathFlattenFields( 978 config=config, 979 field_path=model_field_path, 980 delete_origin_value=model.delete_origin_value 981 if model.delete_origin_value is not None 982 else False, 983 replace_record=model.replace_record if model.replace_record is not None else False, 984 key_transformation=key_transformation, 985 parameters=model.parameters or {}, 986 ) 987 988 @staticmethod 989 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 990 if not value_type: 991 return None 992 names_to_types = { 993 ValueType.string: str, 994 ValueType.number: float, 995 ValueType.integer: int, 996 ValueType.boolean: bool, 997 } 998 return names_to_types[value_type] 999 1000 def create_api_key_authenticator( 1001 self, 1002 model: ApiKeyAuthenticatorModel, 1003 config: Config, 1004 token_provider: Optional[TokenProvider] = None, 1005 **kwargs: Any, 1006 ) -> ApiKeyAuthenticator: 1007 if model.inject_into is None and model.header is None: 1008 raise ValueError( 1009 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1010 ) 1011 1012 if model.inject_into is not None and model.header is not None: 1013 raise ValueError( 1014 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1015 ) 1016 1017 if token_provider is not None and model.api_token != "": 1018 raise ValueError( 1019 "If token_provider is set, api_token is ignored and has to be set to 
empty string." 1020 ) 1021 1022 request_option = ( 1023 self._create_component_from_model( 1024 model.inject_into, config, parameters=model.parameters or {} 1025 ) 1026 if model.inject_into 1027 else RequestOption( 1028 inject_into=RequestOptionType.header, 1029 field_name=model.header or "", 1030 parameters=model.parameters or {}, 1031 ) 1032 ) 1033 1034 return ApiKeyAuthenticator( 1035 token_provider=( 1036 token_provider 1037 if token_provider is not None 1038 else InterpolatedStringTokenProvider( 1039 api_token=model.api_token or "", 1040 config=config, 1041 parameters=model.parameters or {}, 1042 ) 1043 ), 1044 request_option=request_option, 1045 config=config, 1046 parameters=model.parameters or {}, 1047 ) 1048 1049 def create_legacy_to_per_partition_state_migration( 1050 self, 1051 model: LegacyToPerPartitionStateMigrationModel, 1052 config: Mapping[str, Any], 1053 declarative_stream: DeclarativeStreamModel, 1054 ) -> LegacyToPerPartitionStateMigration: 1055 retriever = declarative_stream.retriever 1056 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1057 raise ValueError( 1058 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1059 ) 1060 partition_router = retriever.partition_router 1061 if not isinstance( 1062 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1063 ): 1064 raise ValueError( 1065 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1066 ) 1067 if not hasattr(partition_router, "parent_stream_configs"): 1068 raise ValueError( 1069 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1070 ) 1071 1072 if not hasattr(declarative_stream, "incremental_sync"): 1073 raise ValueError( 1074 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1075 ) 1076 1077 return LegacyToPerPartitionStateMigration( 1078 partition_router, # type: ignore # was already checked above 1079 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
1080 config, 1081 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1082 ) 1083 1084 def create_session_token_authenticator( 1085 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1086 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1087 decoder = ( 1088 self._create_component_from_model(model=model.decoder, config=config) 1089 if model.decoder 1090 else JsonDecoder(parameters={}) 1091 ) 1092 login_requester = self._create_component_from_model( 1093 model=model.login_requester, 1094 config=config, 1095 name=f"{name}_login_requester", 1096 decoder=decoder, 1097 ) 1098 token_provider = SessionTokenProvider( 1099 login_requester=login_requester, 1100 session_token_path=model.session_token_path, 1101 expiration_duration=parse_duration(model.expiration_duration) 1102 if model.expiration_duration 1103 else None, 1104 parameters=model.parameters or {}, 1105 message_repository=self._message_repository, 1106 decoder=decoder, 1107 ) 1108 if model.request_authentication.type == "Bearer": 1109 return ModelToComponentFactory.create_bearer_authenticator( 1110 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1111 config, 1112 token_provider=token_provider, 1113 ) 1114 else: 1115 return self.create_api_key_authenticator( 1116 ApiKeyAuthenticatorModel( 1117 type="ApiKeyAuthenticator", 1118 api_token="", 1119 inject_into=model.request_authentication.inject_into, 1120 ), # type: ignore # $parameters and headers default to None 1121 config=config, 1122 token_provider=token_provider, 1123 ) 1124 1125 @staticmethod 1126 def create_basic_http_authenticator( 1127 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1128 ) -> BasicHttpAuthenticator: 1129 return BasicHttpAuthenticator( 1130 password=model.password or "", 1131 username=model.username, 1132 config=config, 1133 parameters=model.parameters or {}, 1134 ) 1135 1136 @staticmethod 1137 def create_bearer_authenticator( 1138 model: BearerAuthenticatorModel, 1139 config: Config, 1140 token_provider: Optional[TokenProvider] = None, 1141 **kwargs: Any, 1142 ) -> BearerAuthenticator: 1143 if token_provider is not None and model.api_token != "": 1144 raise ValueError( 1145 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1146 ) 1147 return BearerAuthenticator( 1148 token_provider=( 1149 token_provider 1150 if token_provider is not None 1151 else InterpolatedStringTokenProvider( 1152 api_token=model.api_token or "", 1153 config=config, 1154 parameters=model.parameters or {}, 1155 ) 1156 ), 1157 config=config, 1158 parameters=model.parameters or {}, 1159 ) 1160 1161 @staticmethod 1162 def create_dynamic_stream_check_config( 1163 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1164 ) -> DynamicStreamCheckConfig: 1165 return DynamicStreamCheckConfig( 1166 dynamic_stream_name=model.dynamic_stream_name, 1167 stream_count=model.stream_count or 0, 1168 ) 1169 1170 def create_check_stream( 1171 self, model: CheckStreamModel, config: Config, **kwargs: Any 1172 ) -> CheckStream: 1173 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1174 raise ValueError( 1175 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1176 ) 1177 1178 dynamic_streams_check_configs = ( 1179 [ 1180 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1181 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1182 ] 1183 if model.dynamic_streams_check_configs 1184 else [] 1185 ) 1186 1187 return CheckStream( 1188 stream_names=model.stream_names or [], 1189 dynamic_streams_check_configs=dynamic_streams_check_configs, 1190 parameters={}, 1191 ) 1192 1193 @staticmethod 1194 def create_check_dynamic_stream( 1195 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1196 ) -> CheckDynamicStream: 1197 assert model.use_check_availability is not None # for mypy 1198 1199 use_check_availability = model.use_check_availability 1200 1201 return CheckDynamicStream( 1202 stream_count=model.stream_count, 1203 use_check_availability=use_check_availability, 1204 parameters={}, 1205 ) 1206 1207 def create_composite_error_handler( 1208 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1209 ) -> CompositeErrorHandler: 1210 error_handlers = [ 1211 self._create_component_from_model(model=error_handler_model, config=config) 1212 for error_handler_model in model.error_handlers 1213 ] 1214 return CompositeErrorHandler( 1215 error_handlers=error_handlers, parameters=model.parameters or {} 1216 ) 1217 1218 @staticmethod 1219 def create_concurrency_level( 1220 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1221 ) -> ConcurrencyLevel: 1222 return ConcurrencyLevel( 1223 default_concurrency=model.default_concurrency, 1224 max_concurrency=model.max_concurrency, 1225 config=config, 1226 parameters={}, 1227 ) 1228 1229 @staticmethod 1230 def apply_stream_state_migrations( 1231 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1232 ) -> MutableMapping[str, Any]: 1233 if stream_state_migrations: 1234 for state_migration in stream_state_migrations: 1235 if state_migration.should_migrate(stream_state): 1236 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1237 stream_state = dict(state_migration.migrate(stream_state)) 1238 return stream_state 1239 1240 def create_concurrent_cursor_from_datetime_based_cursor( 1241 self, 1242 model_type: Type[BaseModel], 1243 component_definition: ComponentDefinition, 1244 stream_name: str, 1245 stream_namespace: Optional[str], 1246 config: Config, 1247 message_repository: Optional[MessageRepository] = None, 1248 runtime_lookback_window: Optional[datetime.timedelta] = None, 1249 stream_state_migrations: Optional[List[Any]] = None, 1250 **kwargs: Any, 1251 ) -> ConcurrentCursor: 1252 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1253 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1254 # incoming state and connector_state_manager that is initialized when the component factory is created 1255 stream_state = ( 1256 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1257 if "stream_state" not in kwargs 1258 else kwargs["stream_state"] 1259 ) 1260 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1261 1262 component_type = component_definition.get("type") 1263 if component_definition.get("type") != model_type.__name__: 1264 raise ValueError( 1265 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1266 ) 1267 1268 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1269 1270 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1271 raise ValueError( 1272 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1273 ) 1274 1275 interpolated_cursor_field = InterpolatedString.create( 1276 datetime_based_cursor_model.cursor_field, 1277 parameters=datetime_based_cursor_model.parameters or {}, 1278 ) 1279 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1280 1281 interpolated_partition_field_start = InterpolatedString.create( 1282 datetime_based_cursor_model.partition_field_start or "start_time", 1283 parameters=datetime_based_cursor_model.parameters or {}, 1284 ) 1285 interpolated_partition_field_end = InterpolatedString.create( 1286 datetime_based_cursor_model.partition_field_end or "end_time", 1287 parameters=datetime_based_cursor_model.parameters or {}, 1288 ) 1289 1290 slice_boundary_fields = ( 1291 interpolated_partition_field_start.eval(config=config), 1292 interpolated_partition_field_end.eval(config=config), 1293 ) 1294 1295 datetime_format = datetime_based_cursor_model.datetime_format 1296 1297 cursor_granularity = ( 1298 parse_duration(datetime_based_cursor_model.cursor_granularity) 1299 if datetime_based_cursor_model.cursor_granularity 1300 else None 1301 ) 1302 1303 lookback_window = None 1304 interpolated_lookback_window = ( 1305 InterpolatedString.create( 1306 datetime_based_cursor_model.lookback_window, 1307 parameters=datetime_based_cursor_model.parameters or {}, 1308 ) 1309 if datetime_based_cursor_model.lookback_window 1310 else None 1311 ) 1312 if interpolated_lookback_window: 1313 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1314 if evaluated_lookback_window: 1315 lookback_window = parse_duration(evaluated_lookback_window) 1316 1317 connector_state_converter: DateTimeStreamStateConverter 1318 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1319 datetime_format=datetime_format, 1320 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1321 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1322 cursor_granularity=cursor_granularity, 1323 ) 1324 1325 # Adjusts the stream state by applying the runtime lookback window. 1326 # This is used to ensure correct state handling in case of failed partitions. 1327 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1328 if runtime_lookback_window and stream_state_value: 1329 new_stream_state = ( 1330 connector_state_converter.parse_timestamp(stream_state_value) 1331 - runtime_lookback_window 1332 ) 1333 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1334 new_stream_state 1335 ) 1336 1337 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1338 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1339 start_date_runtime_value = self.create_min_max_datetime( 1340 model=datetime_based_cursor_model.start_datetime, config=config 1341 ) 1342 else: 1343 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1344 1345 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1346 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1347 end_date_runtime_value = self.create_min_max_datetime( 1348 model=datetime_based_cursor_model.end_datetime, config=config 1349 ) 1350 else: 1351 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1352 1353 interpolated_start_date = MinMaxDatetime.create( 1354 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1355 parameters=datetime_based_cursor_model.parameters, 1356 ) 1357 interpolated_end_date = ( 1358 None 1359 if not end_date_runtime_value 1360 else MinMaxDatetime.create( 1361 end_date_runtime_value, datetime_based_cursor_model.parameters 1362 ) 1363 ) 1364 1365 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1366 if not interpolated_start_date.datetime_format: 1367 interpolated_start_date.datetime_format = datetime_format 1368 if interpolated_end_date and not interpolated_end_date.datetime_format: 1369 interpolated_end_date.datetime_format = datetime_format 1370 1371 start_date = interpolated_start_date.get_datetime(config=config) 1372 end_date_provider = ( 1373 partial(interpolated_end_date.get_datetime, config) 1374 if interpolated_end_date 1375 else connector_state_converter.get_end_provider() 1376 ) 1377 1378 if ( 1379 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1380 ) or ( 1381 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1382 ): 1383 raise ValueError( 1384 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1385 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1386 ) 1387 1388 # When step is not defined, default to a step size from the starting date to the present moment 1389 step_length = datetime.timedelta.max 1390 interpolated_step = ( 1391 InterpolatedString.create( 1392 datetime_based_cursor_model.step, 1393 parameters=datetime_based_cursor_model.parameters or {}, 1394 ) 1395 if datetime_based_cursor_model.step 1396 else None 1397 ) 1398 if interpolated_step: 1399 evaluated_step = interpolated_step.eval(config) 1400 if evaluated_step: 1401 step_length = parse_duration(evaluated_step) 1402 1403 clamping_strategy: ClampingStrategy = NoClamping() 1404 if datetime_based_cursor_model.clamping: 1405 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1406 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1407 # object which we want to keep agnostic of being low-code 1408 target = InterpolatedString( 1409 string=datetime_based_cursor_model.clamping.target, 1410 parameters=datetime_based_cursor_model.parameters or {}, 1411 ) 1412 evaluated_target = target.eval(config=config) 1413 match evaluated_target: 1414 case "DAY": 1415 clamping_strategy = DayClampingStrategy() 1416 end_date_provider = ClampingEndProvider( 1417 DayClampingStrategy(is_ceiling=False), 1418 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1419 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1420 ) 1421 case "WEEK": 1422 if ( 1423 not datetime_based_cursor_model.clamping.target_details 1424 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1425 ): 1426 raise ValueError( 1427 "Given WEEK clamping, weekday needs to be provided as target_details" 1428 ) 1429 weekday = self._assemble_weekday( 1430 datetime_based_cursor_model.clamping.target_details["weekday"] 1431 ) 1432 clamping_strategy = WeekClampingStrategy(weekday) 1433 end_date_provider = ClampingEndProvider( 1434 WeekClampingStrategy(weekday, is_ceiling=False), 1435 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1436 granularity=cursor_granularity or datetime.timedelta(days=1), 1437 ) 1438 case "MONTH": 1439 clamping_strategy = MonthClampingStrategy() 1440 end_date_provider = ClampingEndProvider( 1441 MonthClampingStrategy(is_ceiling=False), 1442 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1443 granularity=cursor_granularity or datetime.timedelta(days=1), 1444 ) 1445 case _: 1446 raise ValueError( 1447 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1448 ) 1449 1450 return ConcurrentCursor( 1451 stream_name=stream_name, 1452 stream_namespace=stream_namespace, 1453 stream_state=stream_state, 1454 message_repository=message_repository or self._message_repository, 1455 connector_state_manager=self._connector_state_manager, 1456 connector_state_converter=connector_state_converter, 1457 cursor_field=cursor_field, 1458 slice_boundary_fields=slice_boundary_fields, 1459 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1460 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1461 lookback_window=lookback_window, 1462 slice_range=step_length, 1463 cursor_granularity=cursor_granularity, 1464 clamping_strategy=clamping_strategy, 1465 ) 1466 1467 def create_concurrent_cursor_from_incrementing_count_cursor( 1468 self, 1469 model_type: Type[BaseModel], 1470 component_definition: ComponentDefinition, 1471 stream_name: str, 1472 stream_namespace: Optional[str], 1473 config: Config, 1474 message_repository: Optional[MessageRepository] = None, 1475 **kwargs: Any, 1476 ) -> ConcurrentCursor: 1477 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1478 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1479 # incoming state and connector_state_manager that is initialized when the component factory is created 1480 stream_state = ( 1481 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1482 if "stream_state" not in kwargs 1483 else kwargs["stream_state"] 1484 ) 1485 1486 component_type = component_definition.get("type") 1487 if component_definition.get("type") != model_type.__name__: 1488 raise ValueError( 1489 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1490 ) 1491 1492 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1493 1494 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1495 raise ValueError( 1496 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1497 ) 1498 1499 interpolated_start_value = ( 1500 InterpolatedString.create( 1501 incrementing_count_cursor_model.start_value, # type: ignore 1502 parameters=incrementing_count_cursor_model.parameters or {}, 1503 ) 1504 if incrementing_count_cursor_model.start_value 1505 else 0 1506 ) 1507 1508 interpolated_cursor_field = InterpolatedString.create( 1509 incrementing_count_cursor_model.cursor_field, 1510 parameters=incrementing_count_cursor_model.parameters or {}, 1511 ) 1512 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1513 1514 connector_state_converter = IncrementingCountStreamStateConverter( 1515 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1516 ) 1517 1518 return ConcurrentCursor( 1519 stream_name=stream_name, 1520 stream_namespace=stream_namespace, 1521 stream_state=stream_state, 1522 message_repository=message_repository or self._message_repository, 
1523 connector_state_manager=self._connector_state_manager, 1524 connector_state_converter=connector_state_converter, 1525 cursor_field=cursor_field, 1526 slice_boundary_fields=None, 1527 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1528 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1529 ) 1530 1531 def _assemble_weekday(self, weekday: str) -> Weekday: 1532 match weekday: 1533 case "MONDAY": 1534 return Weekday.MONDAY 1535 case "TUESDAY": 1536 return Weekday.TUESDAY 1537 case "WEDNESDAY": 1538 return Weekday.WEDNESDAY 1539 case "THURSDAY": 1540 return Weekday.THURSDAY 1541 case "FRIDAY": 1542 return Weekday.FRIDAY 1543 case "SATURDAY": 1544 return Weekday.SATURDAY 1545 case "SUNDAY": 1546 return Weekday.SUNDAY 1547 case _: 1548 raise ValueError(f"Unknown weekday {weekday}") 1549 1550 def create_concurrent_cursor_from_perpartition_cursor( 1551 self, 1552 state_manager: ConnectorStateManager, 1553 model_type: Type[BaseModel], 1554 component_definition: ComponentDefinition, 1555 stream_name: str, 1556 stream_namespace: Optional[str], 1557 config: Config, 1558 stream_state: MutableMapping[str, Any], 1559 partition_router: PartitionRouter, 1560 stream_state_migrations: Optional[List[Any]] = None, 1561 **kwargs: Any, 1562 ) -> ConcurrentPerPartitionCursor: 1563 component_type = component_definition.get("type") 1564 if component_definition.get("type") != model_type.__name__: 1565 raise ValueError( 1566 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1567 ) 1568 1569 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1570 1571 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1572 raise ValueError( 1573 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1574 ) 1575 1576 interpolated_cursor_field = InterpolatedString.create( 1577 datetime_based_cursor_model.cursor_field, 1578 parameters=datetime_based_cursor_model.parameters or {}, 1579 ) 1580 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1581 1582 datetime_format = datetime_based_cursor_model.datetime_format 1583 1584 cursor_granularity = ( 1585 parse_duration(datetime_based_cursor_model.cursor_granularity) 1586 if datetime_based_cursor_model.cursor_granularity 1587 else None 1588 ) 1589 1590 connector_state_converter: DateTimeStreamStateConverter 1591 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1592 datetime_format=datetime_format, 1593 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1594 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1595 cursor_granularity=cursor_granularity, 1596 ) 1597 1598 # Create the cursor factory 1599 cursor_factory = ConcurrentCursorFactory( 1600 partial( 1601 self.create_concurrent_cursor_from_datetime_based_cursor, 1602 state_manager=state_manager, 1603 model_type=model_type, 1604 component_definition=component_definition, 1605 stream_name=stream_name, 1606 stream_namespace=stream_namespace, 1607 config=config, 1608 message_repository=NoopMessageRepository(), 1609 stream_state_migrations=stream_state_migrations, 1610 ) 1611 ) 1612 1613 
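# The cursor_factory above lazily builds one datetime-based ConcurrentCursor per partition; each child cursor receives its partition's state through the stream_state kwarg and uses a NoopMessageRepository, leaving state message emission to the enclosing ConcurrentPerPartitionCursor.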
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1614 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1615 use_global_cursor = isinstance( 1616 partition_router, GroupingPartitionRouter 1617 ) or component_definition.get("global_substream_cursor", False) 1618 1619 # Return the concurrent cursor and state converter 1620 return ConcurrentPerPartitionCursor( 1621 cursor_factory=cursor_factory, 1622 partition_router=partition_router, 1623 stream_name=stream_name, 1624 stream_namespace=stream_namespace, 1625 stream_state=stream_state, 1626 message_repository=self._message_repository, # type: ignore 1627 connector_state_manager=state_manager, 1628 connector_state_converter=connector_state_converter, 1629 cursor_field=cursor_field, 1630 use_global_cursor=use_global_cursor, 1631 ) 1632 1633 @staticmethod 1634 def create_constant_backoff_strategy( 1635 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1636 ) -> ConstantBackoffStrategy: 1637 return ConstantBackoffStrategy( 1638 backoff_time_in_seconds=model.backoff_time_in_seconds, 1639 config=config, 1640 parameters=model.parameters or {}, 1641 ) 1642 1643 def create_cursor_pagination( 1644 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1645 ) -> CursorPaginationStrategy: 1646 if isinstance(decoder, PaginationDecoderDecorator): 1647 inner_decoder = decoder.decoder 1648 else: 1649 inner_decoder = decoder 1650 decoder = PaginationDecoderDecorator(decoder=decoder) 1651 1652 if self._is_supported_decoder_for_pagination(inner_decoder): 1653 decoder_to_use = decoder 1654 else: 1655 raise ValueError( 1656 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1657 ) 1658 1659 return CursorPaginationStrategy( 1660 cursor_value=model.cursor_value, 1661 decoder=decoder_to_use, 1662 page_size=model.page_size, 1663 stop_condition=model.stop_condition, 1664 config=config, 1665 parameters=model.parameters or {}, 1666 ) 1667 1668 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1669 """ 1670 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1671 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1672 :param model: The Pydantic model of the custom component being created 1673 :param config: The custom defined connector config 1674 :return: The declarative component built from the Pydantic model to be used at runtime 1675 """ 1676 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1677 component_fields = get_type_hints(custom_component_class) 1678 model_args = model.dict() 1679 model_args["config"] = config 1680 1681 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1682 # we defer to these arguments over the component's definition 1683 for key, arg in kwargs.items(): 1684 model_args[key] = arg 1685 1686 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1687 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1688 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1689 for model_field, model_value in model_args.items(): 1690 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1691 if ( 1692 isinstance(model_value, dict) 1693 and "type" not in model_value 1694 and model_field in component_fields 1695 ): 1696 derived_type = self._derive_component_type_from_type_hints( 1697 component_fields.get(model_field) 1698 ) 1699 if derived_type: 1700 model_value["type"] = derived_type 1701 1702 if self._is_component(model_value): 1703 model_args[model_field] = self._create_nested_component( 1704 model, model_field, model_value, config 1705 ) 1706 elif isinstance(model_value, list): 1707 vals = [] 1708 for v in model_value: 1709 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1710 derived_type = self._derive_component_type_from_type_hints( 1711 component_fields.get(model_field) 1712 ) 1713 if derived_type: 1714 v["type"] = derived_type 1715 if self._is_component(v): 1716 vals.append(self._create_nested_component(model, model_field, v, config)) 1717 else: 1718 vals.append(v) 1719 model_args[model_field] = vals 1720 1721 kwargs = { 1722 class_field: model_args[class_field] 1723 for class_field in component_fields.keys() 1724 if class_field in model_args 1725 } 1726 return custom_component_class(**kwargs) 1727 1728 @staticmethod 1729 def _get_class_from_fully_qualified_class_name( 1730 full_qualified_class_name: str, 1731 ) -> Any: 1732 """Get a class from its fully qualified name. 1733 1734 If a custom components module is needed, we assume it is already registered - probably 1735 as `source_declarative_manifest.components` or `components`. 1736 1737 Args: 1738 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1739 1740 Returns: 1741 Any: The class object. 1742 1743 Raises: 1744 ValueError: If the class cannot be loaded. 1745 """ 1746 split = full_qualified_class_name.split(".") 1747 module_name_full = ".".join(split[:-1]) 1748 class_name = split[-1] 1749 1750 try: 1751 module_ref = importlib.import_module(module_name_full) 1752 except ModuleNotFoundError as e: 1753 if split[0] == "source_declarative_manifest": 1754 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1755 try: 1756 import os 1757 1758 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1759 module_ref = importlib.import_module( 1760 module_name_with_source_declarative_manifest 1761 ) 1762 except ModuleNotFoundError: 1763 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1764 else: 1765 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1766 1767 try: 1768 return getattr(module_ref, class_name) 1769 except AttributeError as e: 1770 raise ValueError( 1771 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1772 ) from e 1773 1774 @staticmethod 1775 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1776 interface = field_type 1777 while True: 1778 origin = get_origin(interface) 1779 if origin: 1780 # Unnest types until we reach the raw type 1781 # List[T] -> T 1782 # Optional[List[T]] -> T 1783 args = get_args(interface) 1784 interface = args[0] 1785 else: 1786 break 1787 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1788 return interface.__name__ 1789 return None 1790 1791 @staticmethod 1792 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1793 if not cls: 1794 return False 1795 return cls.__module__ == "builtins" 1796 1797 @staticmethod 1798 def _extract_missing_parameters(error: TypeError) -> List[str]: 1799 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1800 if parameter_search: 1801 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1802 else: 1803 return [] 1804 1805 def _create_nested_component( 1806 self, model: Any, model_field: str, model_value: Any, config: Config 1807 ) -> Any: 1808 type_name = model_value.get("type", None) 1809 if not type_name: 1810 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1811 return model_value 1812 1813 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1814 if model_type: 1815 parsed_model = model_type.parse_obj(model_value) 1816 try: 1817 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1818 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1819 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1820 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1821 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1822 # are needed by a component and could not be shared. 
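# Illustrative manifest fragment for the DefaultPaginator example mentioned above (the URL, module path and
# stream layout are hypothetical): because the paginator sits inside a custom retriever, url_base cannot be
# propagated automatically, so it is supplied through $parameters instead:
#
#   retriever:
#     type: CustomRetriever
#     class_name: source_example.components.MyRetriever
#     paginator:
#       type: DefaultPaginator
#       $parameters:
#         url_base: "https://api.example.com/v1"
#       pagination_strategy:
#         type: PageIncrement
#         page_size: 100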
1823 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1824 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1825 model_parameters = model_value.get("$parameters", {}) 1826 matching_parameters = { 1827 kwarg: model_parameters[kwarg] 1828 for kwarg in constructor_kwargs 1829 if kwarg in model_parameters 1830 } 1831 return self._create_component_from_model( 1832 model=parsed_model, config=config, **matching_parameters 1833 ) 1834 except TypeError as error: 1835 missing_parameters = self._extract_missing_parameters(error) 1836 if missing_parameters: 1837 raise ValueError( 1838 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1839 + ", ".join( 1840 ( 1841 f"{type_name}.$parameters.{parameter}" 1842 for parameter in missing_parameters 1843 ) 1844 ) 1845 ) 1846 raise TypeError( 1847 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1848 ) 1849 else: 1850 raise ValueError( 1851 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1852 ) 1853 1854 @staticmethod 1855 def _is_component(model_value: Any) -> bool: 1856 return isinstance(model_value, dict) and model_value.get("type") is not None 1857 1858 def create_datetime_based_cursor( 1859 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1860 ) -> DatetimeBasedCursor: 1861 start_datetime: Union[str, MinMaxDatetime] = ( 1862 model.start_datetime 1863 if isinstance(model.start_datetime, str) 1864 else self.create_min_max_datetime(model.start_datetime, config) 1865 ) 1866 end_datetime: Union[str, MinMaxDatetime, None] = None 1867 if model.is_data_feed and model.end_datetime: 1868 raise ValueError("Data feed does not support end_datetime") 1869 if model.is_data_feed and model.is_client_side_incremental: 1870 raise ValueError( 1871 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
1872 ) 1873 if model.end_datetime: 1874 end_datetime = ( 1875 model.end_datetime 1876 if isinstance(model.end_datetime, str) 1877 else self.create_min_max_datetime(model.end_datetime, config) 1878 ) 1879 1880 end_time_option = ( 1881 self._create_component_from_model( 1882 model.end_time_option, config, parameters=model.parameters or {} 1883 ) 1884 if model.end_time_option 1885 else None 1886 ) 1887 start_time_option = ( 1888 self._create_component_from_model( 1889 model.start_time_option, config, parameters=model.parameters or {} 1890 ) 1891 if model.start_time_option 1892 else None 1893 ) 1894 1895 return DatetimeBasedCursor( 1896 cursor_field=model.cursor_field, 1897 cursor_datetime_formats=model.cursor_datetime_formats 1898 if model.cursor_datetime_formats 1899 else [], 1900 cursor_granularity=model.cursor_granularity, 1901 datetime_format=model.datetime_format, 1902 end_datetime=end_datetime, 1903 start_datetime=start_datetime, 1904 step=model.step, 1905 end_time_option=end_time_option, 1906 lookback_window=model.lookback_window, 1907 start_time_option=start_time_option, 1908 partition_field_end=model.partition_field_end, 1909 partition_field_start=model.partition_field_start, 1910 message_repository=self._message_repository, 1911 is_compare_strictly=model.is_compare_strictly, 1912 config=config, 1913 parameters=model.parameters or {}, 1914 ) 1915 1916 def create_declarative_stream( 1917 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1918 ) -> DeclarativeStream: 1919 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1920 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1921 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1922 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1923 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1924 1925 primary_key = model.primary_key.__root__ if model.primary_key else None 1926 stop_condition_on_cursor = ( 1927 model.incremental_sync 1928 and hasattr(model.incremental_sync, "is_data_feed") 1929 and model.incremental_sync.is_data_feed 1930 ) 1931 client_side_incremental_sync = None 1932 if ( 1933 model.incremental_sync 1934 and hasattr(model.incremental_sync, "is_client_side_incremental") 1935 and model.incremental_sync.is_client_side_incremental 1936 ): 1937 supported_slicers = ( 1938 DatetimeBasedCursor, 1939 GlobalSubstreamCursor, 1940 PerPartitionWithGlobalCursor, 1941 ) 1942 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1943 raise ValueError( 1944 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1945 ) 1946 cursor = ( 1947 combined_slicers 1948 if isinstance( 1949 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1950 ) 1951 else self._create_component_from_model(model=model.incremental_sync, config=config) 1952 ) 1953 1954 client_side_incremental_sync = {"cursor": cursor} 1955 1956 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1957 cursor_model = model.incremental_sync 1958 1959 end_time_option = ( 1960 self._create_component_from_model( 1961 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1962 ) 1963 if cursor_model.end_time_option 1964 else None 1965 ) 1966 start_time_option = ( 1967 self._create_component_from_model( 1968 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1969 ) 1970 if cursor_model.start_time_option 1971 else None 1972 ) 1973 1974 request_options_provider = DatetimeBasedRequestOptionsProvider( 1975 start_time_option=start_time_option, 1976 end_time_option=end_time_option, 1977 partition_field_start=cursor_model.partition_field_end, 1978 partition_field_end=cursor_model.partition_field_end, 1979 config=config, 1980 parameters=model.parameters or {}, 1981 ) 1982 elif model.incremental_sync and isinstance( 1983 model.incremental_sync, IncrementingCountCursorModel 1984 ): 1985 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1986 1987 start_time_option = ( 1988 self._create_component_from_model( 1989 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1990 config, 1991 parameters=cursor_model.parameters or {}, 1992 ) 1993 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1994 else None 1995 ) 1996 1997 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1998 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1999 partition_field_start = "start" 2000 2001 request_options_provider = DatetimeBasedRequestOptionsProvider( 2002 start_time_option=start_time_option, 2003 partition_field_start=partition_field_start, 2004 config=config, 2005 parameters=model.parameters or {}, 2006 ) 2007 else: 2008 request_options_provider = None 2009 2010 transformations = [] 2011 if model.transformations: 2012 for transformation_model in model.transformations: 2013 transformations.append( 2014 self._create_component_from_model(model=transformation_model, config=config) 2015 ) 2016 file_uploader = None 2017 if model.file_uploader: 2018 file_uploader = self._create_component_from_model( 2019 model=model.file_uploader, config=config 2020 ) 2021 2022 retriever = self._create_component_from_model( 2023 model=model.retriever, 2024 config=config, 2025 name=model.name, 2026 primary_key=primary_key, 2027 stream_slicer=combined_slicers, 2028 request_options_provider=request_options_provider, 2029 stop_condition_on_cursor=stop_condition_on_cursor, 2030 client_side_incremental_sync=client_side_incremental_sync, 2031 transformations=transformations, 2032 file_uploader=file_uploader, 2033 incremental_sync=model.incremental_sync, 2034 ) 2035 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2036 2037 if model.state_migrations: 2038 state_transformations = [ 2039 self._create_component_from_model(state_migration, config, declarative_stream=model) 2040 for 
state_migration in model.state_migrations 2041 ] 2042 else: 2043 state_transformations = [] 2044 2045 schema_loader: Union[ 2046 CompositeSchemaLoader, 2047 DefaultSchemaLoader, 2048 DynamicSchemaLoader, 2049 InlineSchemaLoader, 2050 JsonFileSchemaLoader, 2051 ] 2052 if model.schema_loader and isinstance(model.schema_loader, list): 2053 nested_schema_loaders = [ 2054 self._create_component_from_model(model=nested_schema_loader, config=config) 2055 for nested_schema_loader in model.schema_loader 2056 ] 2057 schema_loader = CompositeSchemaLoader( 2058 schema_loaders=nested_schema_loaders, parameters={} 2059 ) 2060 elif model.schema_loader: 2061 schema_loader = self._create_component_from_model( 2062 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2063 config=config, 2064 ) 2065 else: 2066 options = model.parameters or {} 2067 if "name" not in options: 2068 options["name"] = model.name 2069 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2070 2071 return DeclarativeStream( 2072 name=model.name or "", 2073 primary_key=primary_key, 2074 retriever=retriever, 2075 schema_loader=schema_loader, 2076 stream_cursor_field=cursor_field or "", 2077 state_migrations=state_transformations, 2078 config=config, 2079 parameters=model.parameters or {}, 2080 ) 2081 2082 def _build_stream_slicer_from_partition_router( 2083 self, 2084 model: Union[ 2085 AsyncRetrieverModel, 2086 CustomRetrieverModel, 2087 SimpleRetrieverModel, 2088 ], 2089 config: Config, 2090 stream_name: Optional[str] = None, 2091 ) -> Optional[PartitionRouter]: 2092 if ( 2093 hasattr(model, "partition_router") 2094 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2095 and model.partition_router 2096 ): 2097 stream_slicer_model = model.partition_router 2098 if isinstance(stream_slicer_model, list): 2099 return CartesianProductStreamSlicer( 2100 [ 2101 self._create_component_from_model( 2102 model=slicer, config=config, stream_name=stream_name or "" 2103 ) 2104 for slicer in stream_slicer_model 2105 ], 2106 parameters={}, 2107 ) 2108 else: 2109 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2110 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2111 ) 2112 return None 2113 2114 def _build_incremental_cursor( 2115 self, 2116 model: DeclarativeStreamModel, 2117 stream_slicer: Optional[PartitionRouter], 2118 config: Config, 2119 ) -> Optional[StreamSlicer]: 2120 if model.incremental_sync and stream_slicer: 2121 if model.retriever.type == "AsyncRetriever": 2122 stream_name = model.name or "" 2123 stream_namespace = None 2124 stream_state = self._connector_state_manager.get_stream_state( 2125 stream_name, stream_namespace 2126 ) 2127 state_transformations = ( 2128 [ 2129 self._create_component_from_model( 2130 state_migration, config, declarative_stream=model 2131 ) 2132 for state_migration in model.state_migrations 2133 ] 2134 if model.state_migrations 2135 else [] 2136 ) 2137 2138 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2139 state_manager=self._connector_state_manager, 2140 model_type=DatetimeBasedCursorModel, 2141 component_definition=model.incremental_sync.__dict__, 2142 stream_name=stream_name, 2143 stream_namespace=stream_namespace, 2144 config=config or {}, 2145 stream_state=stream_state, 2146 stream_state_migrations=state_transformations, 2147 partition_router=stream_slicer, 2148 ) 2149 2150 incremental_sync_model = model.incremental_sync 2151 cursor_component = self._create_component_from_model( 2152 model=incremental_sync_model, config=config 2153 ) 2154 is_global_cursor = ( 2155 hasattr(incremental_sync_model, "global_substream_cursor") 2156 and incremental_sync_model.global_substream_cursor 2157 ) 2158 2159 if is_global_cursor: 2160 return GlobalSubstreamCursor( 2161 stream_cursor=cursor_component, partition_router=stream_slicer 2162 ) 2163 return PerPartitionWithGlobalCursor( 2164 cursor_factory=CursorFactory( 2165 lambda: self._create_component_from_model( 2166 model=incremental_sync_model, config=config 2167 ), 2168 ), 2169 partition_router=stream_slicer, 2170 stream_cursor=cursor_component, 2171 ) 2172 elif model.incremental_sync: 2173 if model.retriever.type == "AsyncRetriever": 2174 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2175 model_type=DatetimeBasedCursorModel, 2176 component_definition=model.incremental_sync.__dict__, 2177 stream_name=model.name or "", 2178 stream_namespace=None, 2179 config=config or {}, 2180 stream_state_migrations=model.state_migrations, 2181 ) 2182 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2183 return None 2184 2185 def _build_resumable_cursor( 2186 self, 2187 model: Union[ 2188 AsyncRetrieverModel, 2189 CustomRetrieverModel, 2190 SimpleRetrieverModel, 2191 ], 2192 stream_slicer: Optional[PartitionRouter], 2193 ) -> Optional[StreamSlicer]: 2194 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2195 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2196 return ResumableFullRefreshCursor(parameters={}) 2197 elif stream_slicer: 2198 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2199 return PerPartitionCursor( 2200 cursor_factory=CursorFactory( 2201 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2202 ), 2203 partition_router=stream_slicer, 2204 ) 2205 return None 2206 2207 def _merge_stream_slicers( 2208 self, model: DeclarativeStreamModel, config: Config 2209 ) -> Optional[StreamSlicer]: 2210 retriever_model = model.retriever 2211 2212 stream_slicer = self._build_stream_slicer_from_partition_router( 2213 retriever_model, config, stream_name=model.name 2214 ) 2215 2216 if retriever_model.type == "AsyncRetriever": 2217 is_not_datetime_cursor = ( 2218 model.incremental_sync.type != "DatetimeBasedCursor" 2219 if model.incremental_sync 2220 else None 2221 ) 2222 is_partition_router = ( 2223 
bool(retriever_model.partition_router) if model.incremental_sync else None 2224 ) 2225 2226 if is_not_datetime_cursor: 2227 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2228 # support or unordered slices (for example, when we trigger reports for January and February, the report 2229 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2230 # implementation available in the CDK, we can enable more cursors here. 2231 raise ValueError( 2232 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2233 ) 2234 2235 if is_partition_router and not stream_slicer: 2236 # Note that this development is also done in parallel to the per partition development which once merged 2237 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2238 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2239 2240 if model.incremental_sync: 2241 return self._build_incremental_cursor(model, stream_slicer, config) 2242 2243 return ( 2244 stream_slicer 2245 if self._disable_resumable_full_refresh 2246 else self._build_resumable_cursor(retriever_model, stream_slicer) 2247 ) 2248 2249 def create_default_error_handler( 2250 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2251 ) -> DefaultErrorHandler: 2252 backoff_strategies = [] 2253 if model.backoff_strategies: 2254 for backoff_strategy_model in model.backoff_strategies: 2255 backoff_strategies.append( 2256 self._create_component_from_model(model=backoff_strategy_model, config=config) 2257 ) 2258 2259 response_filters = [] 2260 if model.response_filters: 2261 for response_filter_model in model.response_filters: 2262 response_filters.append( 2263 self._create_component_from_model(model=response_filter_model, config=config) 2264 ) 2265 response_filters.append( 2266 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2267 ) 2268 2269 return DefaultErrorHandler( 2270 backoff_strategies=backoff_strategies, 2271 max_retries=model.max_retries, 2272 response_filters=response_filters, 2273 config=config, 2274 parameters=model.parameters or {}, 2275 ) 2276 2277 def create_default_paginator( 2278 self, 2279 model: DefaultPaginatorModel, 2280 config: Config, 2281 *, 2282 url_base: str, 2283 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2284 decoder: Optional[Decoder] = None, 2285 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2286 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2287 if decoder: 2288 if self._is_supported_decoder_for_pagination(decoder): 2289 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2290 else: 2291 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2292 else: 2293 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2294 page_size_option = ( 2295 self._create_component_from_model(model=model.page_size_option, config=config) 2296 if model.page_size_option 2297 else None 2298 ) 2299 page_token_option = ( 2300 self._create_component_from_model(model=model.page_token_option, config=config) 2301 if model.page_token_option 2302 else None 2303 ) 2304 pagination_strategy = self._create_component_from_model( 2305 model=model.pagination_strategy, 2306 config=config, 2307 decoder=decoder_to_use, 2308 extractor_model=extractor_model, 2309 ) 2310 if cursor_used_for_stop_condition: 2311 
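# Wrap the strategy so pagination stops once the cursor indicates that returned records no longer need to be synced; a cursor is typically only provided here for data-feed style incremental streams (see stop_condition_on_cursor in create_declarative_stream).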
pagination_strategy = StopConditionPaginationStrategyDecorator( 2312 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2313 ) 2314 paginator = DefaultPaginator( 2315 decoder=decoder_to_use, 2316 page_size_option=page_size_option, 2317 page_token_option=page_token_option, 2318 pagination_strategy=pagination_strategy, 2319 url_base=url_base, 2320 config=config, 2321 parameters=model.parameters or {}, 2322 ) 2323 if self._limit_pages_fetched_per_slice: 2324 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2325 return paginator 2326 2327 def create_dpath_extractor( 2328 self, 2329 model: DpathExtractorModel, 2330 config: Config, 2331 decoder: Optional[Decoder] = None, 2332 **kwargs: Any, 2333 ) -> DpathExtractor: 2334 if decoder: 2335 decoder_to_use = decoder 2336 else: 2337 decoder_to_use = JsonDecoder(parameters={}) 2338 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2339 return DpathExtractor( 2340 decoder=decoder_to_use, 2341 field_path=model_field_path, 2342 config=config, 2343 parameters=model.parameters or {}, 2344 ) 2345 2346 @staticmethod 2347 def create_response_to_file_extractor( 2348 model: ResponseToFileExtractorModel, 2349 **kwargs: Any, 2350 ) -> ResponseToFileExtractor: 2351 return ResponseToFileExtractor(parameters=model.parameters or {}) 2352 2353 @staticmethod 2354 def create_exponential_backoff_strategy( 2355 model: ExponentialBackoffStrategyModel, config: Config 2356 ) -> ExponentialBackoffStrategy: 2357 return ExponentialBackoffStrategy( 2358 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2359 ) 2360 2361 @staticmethod 2362 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2363 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2364 2365 def create_http_requester( 2366 self, 2367 model: HttpRequesterModel, 2368 config: Config, 2369 decoder: Decoder = JsonDecoder(parameters={}), 2370 query_properties_key: Optional[str] = None, 2371 use_cache: Optional[bool] = None, 2372 *, 2373 name: str, 2374 ) -> HttpRequester: 2375 authenticator = ( 2376 self._create_component_from_model( 2377 model=model.authenticator, 2378 config=config, 2379 url_base=model.url or model.url_base, 2380 name=name, 2381 decoder=decoder, 2382 ) 2383 if model.authenticator 2384 else None 2385 ) 2386 error_handler = ( 2387 self._create_component_from_model(model=model.error_handler, config=config) 2388 if model.error_handler 2389 else DefaultErrorHandler( 2390 backoff_strategies=[], 2391 response_filters=[], 2392 config=config, 2393 parameters=model.parameters or {}, 2394 ) 2395 ) 2396 2397 api_budget = self._api_budget 2398 2399 # Removes QueryProperties components from the interpolated mappings because it has been designed 2400 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2401 # instead of through jinja interpolation 2402 request_parameters: Optional[Union[str, Mapping[str, str]]] 2403 if isinstance(model.request_parameters, Mapping): 2404 request_parameters = self._remove_query_properties(model.request_parameters) 2405 else: 2406 request_parameters = model.request_parameters 2407 2408 request_options_provider = InterpolatedRequestOptionsProvider( 2409 request_body=model.request_body, 2410 request_body_data=model.request_body_data, 2411 request_body_json=model.request_body_json, 2412 request_headers=model.request_headers, 2413 
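# request_parameters has already had any QueryProperties entries stripped out above; those are resolved per slice by the SimpleRetriever instead of through Jinja interpolation.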
request_parameters=request_parameters, 2414 query_properties_key=query_properties_key, 2415 config=config, 2416 parameters=model.parameters or {}, 2417 ) 2418 2419 assert model.use_cache is not None # for mypy 2420 assert model.http_method is not None # for mypy 2421 2422 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2423 2424 return HttpRequester( 2425 name=name, 2426 url=model.url, 2427 url_base=model.url_base, 2428 path=model.path, 2429 authenticator=authenticator, 2430 error_handler=error_handler, 2431 api_budget=api_budget, 2432 http_method=HttpMethod[model.http_method.value], 2433 request_options_provider=request_options_provider, 2434 config=config, 2435 disable_retries=self._disable_retries, 2436 parameters=model.parameters or {}, 2437 message_repository=self._message_repository, 2438 use_cache=should_use_cache, 2439 decoder=decoder, 2440 stream_response=decoder.is_stream_response() if decoder else False, 2441 ) 2442 2443 @staticmethod 2444 def create_http_response_filter( 2445 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2446 ) -> HttpResponseFilter: 2447 if model.action: 2448 action = ResponseAction(model.action.value) 2449 else: 2450 action = None 2451 2452 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2453 2454 http_codes = ( 2455 set(model.http_codes) if model.http_codes else set() 2456 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2457 2458 return HttpResponseFilter( 2459 action=action, 2460 failure_type=failure_type, 2461 error_message=model.error_message or "", 2462 error_message_contains=model.error_message_contains or "", 2463 http_codes=http_codes, 2464 predicate=model.predicate or "", 2465 config=config, 2466 parameters=model.parameters or {}, 2467 ) 2468 2469 @staticmethod 2470 def create_inline_schema_loader( 2471 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2472 ) -> InlineSchemaLoader: 2473 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2474 2475 def create_complex_field_type( 2476 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2477 ) -> ComplexFieldType: 2478 items = ( 2479 self._create_component_from_model(model=model.items, config=config) 2480 if isinstance(model.items, ComplexFieldTypeModel) 2481 else model.items 2482 ) 2483 2484 return ComplexFieldType(field_type=model.field_type, items=items) 2485 2486 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2487 target_type = ( 2488 self._create_component_from_model(model=model.target_type, config=config) 2489 if isinstance(model.target_type, ComplexFieldTypeModel) 2490 else model.target_type 2491 ) 2492 2493 return TypesMap( 2494 target_type=target_type, 2495 current_type=model.current_type, 2496 condition=model.condition if model.condition is not None else "True", 2497 ) 2498 2499 def create_schema_type_identifier( 2500 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2501 ) -> SchemaTypeIdentifier: 2502 types_mapping = [] 2503 if model.types_mapping: 2504 types_mapping.extend( 2505 [ 2506 self._create_component_from_model(types_map, config=config) 2507 for types_map in model.types_mapping 2508 ] 2509 ) 2510 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2511 [x for x in model.schema_pointer] if model.schema_pointer else [] 2512 ) 2513 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2514 
model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2515 [x for x in model.type_pointer] if model.type_pointer else None 2516 ) 2517 2518 return SchemaTypeIdentifier( 2519 schema_pointer=model_schema_pointer, 2520 key_pointer=model_key_pointer, 2521 type_pointer=model_type_pointer, 2522 types_mapping=types_mapping, 2523 parameters=model.parameters or {}, 2524 ) 2525 2526 def create_dynamic_schema_loader( 2527 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2528 ) -> DynamicSchemaLoader: 2529 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2530 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2531 2532 schema_transformations = [] 2533 if model.schema_transformations: 2534 for transformation_model in model.schema_transformations: 2535 schema_transformations.append( 2536 self._create_component_from_model(model=transformation_model, config=config) 2537 ) 2538 name = "dynamic_properties" 2539 retriever = self._create_component_from_model( 2540 model=model.retriever, 2541 config=config, 2542 name=name, 2543 primary_key=None, 2544 stream_slicer=combined_slicers, 2545 transformations=[], 2546 use_cache=True, 2547 log_formatter=( 2548 lambda response: format_http_message( 2549 response, 2550 f"Schema loader '{name}' request", 2551 f"Request performed in order to extract schema.", 2552 name, 2553 is_auxiliary=True, 2554 ) 2555 ), 2556 ) 2557 schema_type_identifier = self._create_component_from_model( 2558 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2559 ) 2560 schema_filter = ( 2561 self._create_component_from_model( 2562 model.schema_filter, config=config, parameters=model.parameters or {} 2563 ) 2564 if model.schema_filter is not None 2565 else None 2566 ) 2567 2568 return DynamicSchemaLoader( 2569 retriever=retriever, 2570 config=config, 2571 schema_transformations=schema_transformations, 2572 schema_filter=schema_filter, 2573 schema_type_identifier=schema_type_identifier, 2574 parameters=model.parameters or {}, 2575 ) 2576 2577 @staticmethod 2578 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2579 return JsonDecoder(parameters={}) 2580 2581 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2582 return CompositeRawDecoder( 2583 parser=ModelToComponentFactory._get_parser(model, config), 2584 stream_response=False if self._emit_connector_builder_messages else True, 2585 ) 2586 2587 def create_jsonl_decoder( 2588 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2589 ) -> Decoder: 2590 return CompositeRawDecoder( 2591 parser=ModelToComponentFactory._get_parser(model, config), 2592 stream_response=False if self._emit_connector_builder_messages else True, 2593 ) 2594 2595 def create_gzip_decoder( 2596 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2597 ) -> Decoder: 2598 _compressed_response_types = { 2599 "gzip", 2600 "x-gzip", 2601 "gzip, deflate", 2602 "x-gzip, deflate", 2603 "application/zip", 2604 "application/gzip", 2605 "application/x-gzip", 2606 "application/x-zip-compressed", 2607 } 2608 2609 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2610 2611 if self._emit_connector_builder_messages: 2612 # This is very surprising but if the response is not streamed, 2613 # CompositeRawDecoder calls response.content and the requests library 
actually uncompresses the data as opposed to response.raw, 2614 # which uses urllib3 directly and does not uncompress the data. 2615 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2616 2617 return CompositeRawDecoder.by_headers( 2618 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2619 stream_response=True, 2620 fallback_parser=gzip_parser.inner_parser, 2621 ) 2622 2623 @staticmethod 2624 def create_incrementing_count_cursor( 2625 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2626 ) -> DatetimeBasedCursor: 2627 # This should not actually get used anywhere at runtime, but we needed to add this to pass checks since 2628 # we still parse models into components. The issue is that there's no runtime implementation of an 2629 # IncrementingCountCursor. 2630 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor, because the check runs without a ConcurrentCursor. 2631 return DatetimeBasedCursor( 2632 cursor_field=model.cursor_field, 2633 datetime_format="%Y-%m-%d", 2634 start_datetime="2024-12-12", 2635 config=config, 2636 parameters={}, 2637 ) 2638 2639 @staticmethod 2640 def create_iterable_decoder( 2641 model: IterableDecoderModel, config: Config, **kwargs: Any 2642 ) -> IterableDecoder: 2643 return IterableDecoder(parameters={}) 2644 2645 @staticmethod 2646 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2647 return XmlDecoder(parameters={}) 2648 2649 def create_zipfile_decoder( 2650 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2651 ) -> ZipfileDecoder: 2652 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2653 2654 @staticmethod 2655 def _get_parser(model: BaseModel, config: Config) -> Parser: 2656 if isinstance(model, JsonDecoderModel): 2657 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2658 return JsonParser() 2659 elif isinstance(model, JsonlDecoderModel): 2660 return JsonLineParser() 2661 elif isinstance(model, CsvDecoderModel): 2662 return CsvParser( 2663 encoding=model.encoding, 2664 delimiter=model.delimiter, 2665 set_values_to_none=model.set_values_to_none, 2666 ) 2667 elif isinstance(model, GzipDecoderModel): 2668 return GzipParser( 2669 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2670 ) 2671 elif isinstance( 2672 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2673 ): 2674 raise ValueError(f"Decoder type {model} does not have a parser associated with it") 2675 2676 raise ValueError(f"Unknown decoder type {model}") 2677 2678 @staticmethod 2679 def create_json_file_schema_loader( 2680 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2681 ) -> JsonFileSchemaLoader: 2682 return JsonFileSchemaLoader( 2683 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2684 ) 2685 2686 @staticmethod 2687 def create_jwt_authenticator( 2688 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2689 ) -> JwtAuthenticator: 2690 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2691 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2692 return JwtAuthenticator( 2693 config=config, 2694 parameters=model.parameters or {}, 2695 algorithm=JwtAlgorithm(model.algorithm.value), 2696 secret_key=model.secret_key, 2697
base64_encode_secret_key=model.base64_encode_secret_key, 2698 token_duration=model.token_duration, 2699 header_prefix=model.header_prefix, 2700 kid=jwt_headers.kid, 2701 typ=jwt_headers.typ, 2702 cty=jwt_headers.cty, 2703 iss=jwt_payload.iss, 2704 sub=jwt_payload.sub, 2705 aud=jwt_payload.aud, 2706 additional_jwt_headers=model.additional_jwt_headers, 2707 additional_jwt_payload=model.additional_jwt_payload, 2708 ) 2709 2710 def create_list_partition_router( 2711 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2712 ) -> ListPartitionRouter: 2713 request_option = ( 2714 self._create_component_from_model(model.request_option, config) 2715 if model.request_option 2716 else None 2717 ) 2718 return ListPartitionRouter( 2719 cursor_field=model.cursor_field, 2720 request_option=request_option, 2721 values=model.values, 2722 config=config, 2723 parameters=model.parameters or {}, 2724 ) 2725 2726 @staticmethod 2727 def create_min_max_datetime( 2728 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2729 ) -> MinMaxDatetime: 2730 return MinMaxDatetime( 2731 datetime=model.datetime, 2732 datetime_format=model.datetime_format or "", 2733 max_datetime=model.max_datetime or "", 2734 min_datetime=model.min_datetime or "", 2735 parameters=model.parameters or {}, 2736 ) 2737 2738 @staticmethod 2739 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2740 return NoAuth(parameters=model.parameters or {}) 2741 2742 @staticmethod 2743 def create_no_pagination( 2744 model: NoPaginationModel, config: Config, **kwargs: Any 2745 ) -> NoPagination: 2746 return NoPagination(parameters={}) 2747 2748 def create_oauth_authenticator( 2749 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2750 ) -> DeclarativeOauth2Authenticator: 2751 profile_assertion = ( 2752 self._create_component_from_model(model.profile_assertion, config=config) 2753 if model.profile_assertion 2754 else None 2755 ) 2756 2757 if model.refresh_token_updater: 2758 # ignore type error because fixing it would have a lot of dependencies, revisit later 2759 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2760 config, 2761 InterpolatedString.create( 2762 model.token_refresh_endpoint, # type: ignore 2763 parameters=model.parameters or {}, 2764 ).eval(config), 2765 access_token_name=InterpolatedString.create( 2766 model.access_token_name or "access_token", parameters=model.parameters or {} 2767 ).eval(config), 2768 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2769 expires_in_name=InterpolatedString.create( 2770 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2771 ).eval(config), 2772 client_id_name=InterpolatedString.create( 2773 model.client_id_name or "client_id", parameters=model.parameters or {} 2774 ).eval(config), 2775 client_id=InterpolatedString.create( 2776 model.client_id, parameters=model.parameters or {} 2777 ).eval(config) 2778 if model.client_id 2779 else model.client_id, 2780 client_secret_name=InterpolatedString.create( 2781 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2782 ).eval(config), 2783 client_secret=InterpolatedString.create( 2784 model.client_secret, parameters=model.parameters or {} 2785 ).eval(config) 2786 if model.client_secret 2787 else model.client_secret, 2788 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2789 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2790 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2791 grant_type_name=InterpolatedString.create( 2792 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2793 ).eval(config), 2794 grant_type=InterpolatedString.create( 2795 model.grant_type or "refresh_token", parameters=model.parameters or {} 2796 ).eval(config), 2797 refresh_request_body=InterpolatedMapping( 2798 model.refresh_request_body or {}, parameters=model.parameters or {} 2799 ).eval(config), 2800 refresh_request_headers=InterpolatedMapping( 2801 model.refresh_request_headers or {}, parameters=model.parameters or {} 2802 ).eval(config), 2803 scopes=model.scopes, 2804 token_expiry_date_format=model.token_expiry_date_format, 2805 message_repository=self._message_repository, 2806 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2807 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2808 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2809 ) 2810 # ignore type error because fixing it would have a lot of dependencies, revisit later 2811 return DeclarativeOauth2Authenticator( # type: ignore 2812 access_token_name=model.access_token_name or "access_token", 2813 access_token_value=model.access_token_value, 2814 client_id_name=model.client_id_name or "client_id", 2815 client_id=model.client_id, 2816 client_secret_name=model.client_secret_name or "client_secret", 2817 client_secret=model.client_secret, 2818 expires_in_name=model.expires_in_name or "expires_in", 2819 grant_type_name=model.grant_type_name or "grant_type", 2820 grant_type=model.grant_type or "refresh_token", 2821 refresh_request_body=model.refresh_request_body, 2822 refresh_request_headers=model.refresh_request_headers, 2823 refresh_token_name=model.refresh_token_name or "refresh_token", 2824 refresh_token=model.refresh_token, 2825 scopes=model.scopes, 2826 token_expiry_date=model.token_expiry_date, 2827 token_expiry_date_format=model.token_expiry_date_format, 2828 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2829 token_refresh_endpoint=model.token_refresh_endpoint, 2830 config=config, 2831 parameters=model.parameters or {}, 2832 message_repository=self._message_repository, 2833 profile_assertion=profile_assertion, 2834 use_profile_assertion=model.use_profile_assertion, 2835 ) 2836 2837 def create_offset_increment( 2838 self, 2839 model: OffsetIncrementModel, 2840 config: Config, 2841 decoder: Decoder, 2842 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2843 **kwargs: Any, 2844 ) -> OffsetIncrement: 2845 if isinstance(decoder, PaginationDecoderDecorator): 2846 inner_decoder = decoder.decoder 2847 else: 2848 inner_decoder = decoder 2849 decoder = PaginationDecoderDecorator(decoder=decoder) 2850 2851 if self._is_supported_decoder_for_pagination(inner_decoder): 2852 decoder_to_use = decoder 2853 else: 2854 raise ValueError( 2855 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2856 ) 2857 2858 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2859 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2860 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2861 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2862 # When we have more time to investigate we can look into reusing the same component. 2863 extractor = ( 2864 self._create_component_from_model( 2865 model=extractor_model, config=config, decoder=decoder_to_use 2866 ) 2867 if extractor_model 2868 else None 2869 ) 2870 2871 return OffsetIncrement( 2872 page_size=model.page_size, 2873 config=config, 2874 decoder=decoder_to_use, 2875 extractor=extractor, 2876 inject_on_first_request=model.inject_on_first_request or False, 2877 parameters=model.parameters or {}, 2878 ) 2879 2880 @staticmethod 2881 def create_page_increment( 2882 model: PageIncrementModel, config: Config, **kwargs: Any 2883 ) -> PageIncrement: 2884 return PageIncrement( 2885 page_size=model.page_size, 2886 config=config, 2887 start_from_page=model.start_from_page or 0, 2888 inject_on_first_request=model.inject_on_first_request or False, 2889 parameters=model.parameters or {}, 2890 ) 2891 2892 def create_parent_stream_config( 2893 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2894 ) -> ParentStreamConfig: 2895 declarative_stream = self._create_component_from_model( 2896 model.stream, config=config, **kwargs 2897 ) 2898 request_option = ( 2899 self._create_component_from_model(model.request_option, config=config) 2900 if model.request_option 2901 else None 2902 ) 2903 2904 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2905 raise ValueError( 2906 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2907 ) 2908 2909 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2910 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2911 ) 2912 2913 return ParentStreamConfig( 2914 parent_key=model.parent_key, 2915 request_option=request_option, 2916 stream=declarative_stream, 2917 partition_field=model.partition_field, 2918 config=config, 2919 incremental_dependency=model.incremental_dependency or False, 2920 parameters=model.parameters or {}, 2921 extra_fields=model.extra_fields, 2922 lazy_read_pointer=model_lazy_read_pointer, 2923 ) 2924 2925 def create_properties_from_endpoint( 2926 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2927 ) -> PropertiesFromEndpoint: 2928 retriever = self._create_component_from_model( 2929 model=model.retriever, 2930 config=config, 2931 name="dynamic_properties", 2932 primary_key=None, 2933 stream_slicer=None, 2934 transformations=[], 2935 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2936 ) 2937 return PropertiesFromEndpoint( 2938 property_field_path=model.property_field_path, 2939 retriever=retriever, 2940 config=config, 2941 parameters=model.parameters or {}, 2942 ) 2943 2944 def create_property_chunking( 2945 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2946 ) -> PropertyChunking: 2947 record_merge_strategy = ( 2948 self._create_component_from_model( 2949 model=model.record_merge_strategy, config=config, **kwargs 2950 ) 2951 if model.record_merge_strategy 2952 else None 2953 ) 2954 2955 property_limit_type: PropertyLimitType 2956 match model.property_limit_type: 2957 case PropertyLimitTypeModel.property_count: 2958 property_limit_type = PropertyLimitType.property_count 2959 case PropertyLimitTypeModel.characters: 2960 property_limit_type = PropertyLimitType.characters 2961 case _: 2962 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 2963 2964 return PropertyChunking( 2965 property_limit_type=property_limit_type, 2966 property_limit=model.property_limit, 2967 record_merge_strategy=record_merge_strategy, 2968 config=config, 2969 parameters=model.parameters or {}, 2970 ) 2971 2972 def create_query_properties( 2973 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2974 ) -> QueryProperties: 2975 if isinstance(model.property_list, list): 2976 property_list = model.property_list 2977 else: 2978 property_list = self._create_component_from_model( 2979 model=model.property_list, config=config, **kwargs 2980 ) 2981 2982 property_chunking = ( 2983 self._create_component_from_model( 2984 model=model.property_chunking, config=config, **kwargs 2985 ) 2986 if model.property_chunking 2987 else None 2988 ) 2989 2990 return QueryProperties( 2991 property_list=property_list, 2992 always_include_properties=model.always_include_properties, 2993 property_chunking=property_chunking, 2994 config=config, 2995 parameters=model.parameters or {}, 2996 ) 2997 2998 @staticmethod 2999 def create_record_filter( 3000 model: RecordFilterModel, config: Config, **kwargs: Any 3001 ) -> RecordFilter: 3002 return RecordFilter( 3003 condition=model.condition or "", config=config, parameters=model.parameters or {} 3004 ) 3005 3006 @staticmethod 3007 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3008 return RequestPath(parameters={}) 3009 3010 @staticmethod 3011 def 
create_request_option( 3012 model: RequestOptionModel, config: Config, **kwargs: Any 3013 ) -> RequestOption: 3014 inject_into = RequestOptionType(model.inject_into.value) 3015 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3016 [ 3017 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3018 for segment in model.field_path 3019 ] 3020 if model.field_path 3021 else None 3022 ) 3023 field_name = ( 3024 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3025 if model.field_name 3026 else None 3027 ) 3028 return RequestOption( 3029 field_name=field_name, 3030 field_path=field_path, 3031 inject_into=inject_into, 3032 parameters=kwargs.get("parameters", {}), 3033 ) 3034 3035 def create_record_selector( 3036 self, 3037 model: RecordSelectorModel, 3038 config: Config, 3039 *, 3040 name: str, 3041 transformations: List[RecordTransformation] | None = None, 3042 decoder: Decoder | None = None, 3043 client_side_incremental_sync: Dict[str, Any] | None = None, 3044 file_uploader: Optional[DefaultFileUploader] = None, 3045 **kwargs: Any, 3046 ) -> RecordSelector: 3047 extractor = self._create_component_from_model( 3048 model=model.extractor, decoder=decoder, config=config 3049 ) 3050 record_filter = ( 3051 self._create_component_from_model(model.record_filter, config=config) 3052 if model.record_filter 3053 else None 3054 ) 3055 3056 transform_before_filtering = ( 3057 False if model.transform_before_filtering is None else model.transform_before_filtering 3058 ) 3059 if client_side_incremental_sync: 3060 record_filter = ClientSideIncrementalRecordFilterDecorator( 3061 config=config, 3062 parameters=model.parameters, 3063 condition=model.record_filter.condition 3064 if (model.record_filter and hasattr(model.record_filter, "condition")) 3065 else None, 3066 **client_side_incremental_sync, 3067 ) 3068 transform_before_filtering = ( 3069 True 3070 if model.transform_before_filtering is None 3071 else model.transform_before_filtering 3072 ) 3073 3074 if model.schema_normalization is None: 3075 # default to no schema normalization if not set 3076 model.schema_normalization = SchemaNormalizationModel.None_ 3077 3078 schema_normalization = ( 3079 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3080 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3081 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3082 ) 3083 3084 return RecordSelector( 3085 extractor=extractor, 3086 name=name, 3087 config=config, 3088 record_filter=record_filter, 3089 transformations=transformations or [], 3090 file_uploader=file_uploader, 3091 schema_normalization=schema_normalization, 3092 parameters=model.parameters or {}, 3093 transform_before_filtering=transform_before_filtering, 3094 ) 3095 3096 @staticmethod 3097 def create_remove_fields( 3098 model: RemoveFieldsModel, config: Config, **kwargs: Any 3099 ) -> RemoveFields: 3100 return RemoveFields( 3101 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3102 ) 3103 3104 def create_selective_authenticator( 3105 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3106 ) -> DeclarativeAuthenticator: 3107 authenticators = { 3108 name: self._create_component_from_model(model=auth, config=config) 3109 for name, auth in model.authenticators.items() 3110 } 3111 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3112 return SelectiveAuthenticator( # type: ignore[abstract] 3113 config=config, 3114 authenticators=authenticators, 3115 authenticator_selection_path=model.authenticator_selection_path, 3116 **kwargs, 3117 ) 3118 3119 @staticmethod 3120 def create_legacy_session_token_authenticator( 3121 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3122 ) -> LegacySessionTokenAuthenticator: 3123 return LegacySessionTokenAuthenticator( 3124 api_url=url_base, 3125 header=model.header, 3126 login_url=model.login_url, 3127 password=model.password or "", 3128 session_token=model.session_token or "", 3129 session_token_response_key=model.session_token_response_key or "", 3130 username=model.username or "", 3131 validate_session_url=model.validate_session_url, 3132 config=config, 3133 parameters=model.parameters or {}, 3134 ) 3135 3136 def create_simple_retriever( 3137 self, 3138 model: SimpleRetrieverModel, 3139 config: Config, 3140 *, 3141 name: str, 3142 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3143 stream_slicer: Optional[StreamSlicer], 3144 request_options_provider: Optional[RequestOptionsProvider] = None, 3145 stop_condition_on_cursor: bool = False, 3146 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3147 transformations: List[RecordTransformation], 3148 file_uploader: Optional[DefaultFileUploader] = None, 3149 incremental_sync: Optional[ 3150 Union[ 3151 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3152 ] 3153 ] = None, 3154 use_cache: Optional[bool] = None, 3155 log_formatter: Optional[Callable[[Response], Any]] = None, 3156 **kwargs: Any, 3157 ) -> SimpleRetriever: 3158 def _get_url() -> str: 3159 """ 3160 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3161 This is needed because the URL is not set until the requester is created. 
3162 """ 3163 3164 _url: str = ( 3165 model.requester.url 3166 if hasattr(model.requester, "url") and model.requester.url is not None 3167 else requester.get_url() 3168 ) 3169 _url_base: str = ( 3170 model.requester.url_base 3171 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3172 else requester.get_url_base() 3173 ) 3174 3175 return _url or _url_base 3176 3177 decoder = ( 3178 self._create_component_from_model(model=model.decoder, config=config) 3179 if model.decoder 3180 else JsonDecoder(parameters={}) 3181 ) 3182 record_selector = self._create_component_from_model( 3183 model=model.record_selector, 3184 name=name, 3185 config=config, 3186 decoder=decoder, 3187 transformations=transformations, 3188 client_side_incremental_sync=client_side_incremental_sync, 3189 file_uploader=file_uploader, 3190 ) 3191 3192 query_properties: Optional[QueryProperties] = None 3193 query_properties_key: Optional[str] = None 3194 if self._query_properties_in_request_parameters(model.requester): 3195 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3196 # places instead of default to request_parameters which isn't clearly documented 3197 if ( 3198 hasattr(model.requester, "fetch_properties_from_endpoint") 3199 and model.requester.fetch_properties_from_endpoint 3200 ): 3201 raise ValueError( 3202 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3203 ) 3204 3205 query_properties_definitions = [] 3206 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3207 if isinstance(request_parameter, QueryPropertiesModel): 3208 query_properties_key = key 3209 query_properties_definitions.append(request_parameter) 3210 3211 if len(query_properties_definitions) > 1: 3212 raise ValueError( 3213 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3214 ) 3215 3216 if len(query_properties_definitions) == 1: 3217 query_properties = self._create_component_from_model( 3218 model=query_properties_definitions[0], config=config 3219 ) 3220 elif ( 3221 hasattr(model.requester, "fetch_properties_from_endpoint") 3222 and model.requester.fetch_properties_from_endpoint 3223 ): 3224 query_properties_definition = QueryPropertiesModel( 3225 type="QueryProperties", 3226 property_list=model.requester.fetch_properties_from_endpoint, 3227 always_include_properties=None, 3228 property_chunking=None, 3229 ) # type: ignore # $parameters has a default value 3230 3231 query_properties = self.create_query_properties( 3232 model=query_properties_definition, 3233 config=config, 3234 ) 3235 3236 requester = self._create_component_from_model( 3237 model=model.requester, 3238 decoder=decoder, 3239 name=name, 3240 query_properties_key=query_properties_key, 3241 use_cache=use_cache, 3242 config=config, 3243 ) 3244 3245 # Define cursor only if per partition or common incremental support is needed 3246 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3247 3248 if ( 3249 not isinstance(stream_slicer, DatetimeBasedCursor) 3250 or type(stream_slicer) is not DatetimeBasedCursor 3251 ): 3252 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3253 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3254 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3255 # request_options_provider 3256 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3257 elif not request_options_provider: 3258 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3259 3260 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3261 if self._should_limit_slices_fetched(): 3262 stream_slicer = cast( 3263 StreamSlicer, 3264 StreamSlicerTestReadDecorator( 3265 wrapped_slicer=stream_slicer, 3266 maximum_number_of_slices=self._limit_slices_fetched or 5, 3267 ), 3268 ) 3269 3270 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3271 paginator = ( 3272 self._create_component_from_model( 3273 model=model.paginator, 3274 config=config, 3275 url_base=_get_url(), 3276 extractor_model=model.record_selector.extractor, 3277 decoder=decoder, 3278 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3279 ) 3280 if model.paginator 3281 else NoPagination(parameters={}) 3282 ) 3283 3284 ignore_stream_slicer_parameters_on_paginated_requests = ( 3285 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3286 ) 3287 3288 if ( 3289 model.partition_router 3290 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3291 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3292 and any( 3293 parent_stream_config.lazy_read_pointer 3294 for parent_stream_config in model.partition_router.parent_stream_configs 3295 ) 3296 ): 3297 if incremental_sync: 3298 if incremental_sync.type != "DatetimeBasedCursor": 3299 raise ValueError( 3300 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3301 ) 3302 3303 elif incremental_sync.step or incremental_sync.cursor_granularity: 3304 raise ValueError( 3305 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3306 ) 3307 3308 if model.decoder and model.decoder.type != "JsonDecoder": 3309 raise ValueError( 3310 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3311 ) 3312 3313 return LazySimpleRetriever( 3314 name=name, 3315 paginator=paginator, 3316 primary_key=primary_key, 3317 requester=requester, 3318 record_selector=record_selector, 3319 stream_slicer=stream_slicer, 3320 request_option_provider=request_options_provider, 3321 cursor=cursor, 3322 config=config, 3323 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3324 parameters=model.parameters or {}, 3325 ) 3326 3327 return SimpleRetriever( 3328 name=name, 3329 paginator=paginator, 3330 primary_key=primary_key, 3331 requester=requester, 3332 record_selector=record_selector, 3333 stream_slicer=stream_slicer, 3334 request_option_provider=request_options_provider, 3335 cursor=cursor, 3336 config=config, 3337 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3338 additional_query_properties=query_properties, 3339 log_formatter=self._get_log_formatter(log_formatter, name), 3340 parameters=model.parameters or {}, 3341 ) 3342 3343 def _get_log_formatter( 3344 self, log_formatter: Callable[[Response], Any] | None, name: str 3345 ) -> Callable[[Response], Any] | None: 3346 if self._should_limit_slices_fetched(): 3347 return ( 3348 ( 3349 lambda response: format_http_message( 3350 response, 3351 f"Stream '{name}' request", 3352 f"Request performed in order to extract records for stream '{name}'", 3353 name, 3354 ) 3355 ) 3356 if not log_formatter 3357 else log_formatter 3358 ) 3359 return None 3360 3361 def _should_limit_slices_fetched(self) -> bool: 3362 """ 3363 Returns True if the number of slices fetched should be limited, False otherwise. 3364 This is used to limit the number of slices fetched during tests. 3365 """ 3366 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3367 3368 @staticmethod 3369 def _query_properties_in_request_parameters( 3370 requester: Union[HttpRequesterModel, CustomRequesterModel], 3371 ) -> bool: 3372 if not hasattr(requester, "request_parameters"): 3373 return False 3374 request_parameters = requester.request_parameters 3375 if request_parameters and isinstance(request_parameters, Mapping): 3376 for request_parameter in request_parameters.values(): 3377 if isinstance(request_parameter, QueryPropertiesModel): 3378 return True 3379 return False 3380 3381 @staticmethod 3382 def _remove_query_properties( 3383 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3384 ) -> Mapping[str, str]: 3385 return { 3386 parameter_field: request_parameter 3387 for parameter_field, request_parameter in request_parameters.items() 3388 if not isinstance(request_parameter, QueryPropertiesModel) 3389 } 3390 3391 def create_state_delegating_stream( 3392 self, 3393 model: StateDelegatingStreamModel, 3394 config: Config, 3395 has_parent_state: Optional[bool] = None, 3396 **kwargs: Any, 3397 ) -> DeclarativeStream: 3398 if ( 3399 model.full_refresh_stream.name != model.name 3400 or model.name != model.incremental_stream.name 3401 ): 3402 raise ValueError( 3403 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3404 ) 3405 3406 stream_model = ( 3407 model.incremental_stream 3408 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3409 else model.full_refresh_stream 3410 ) 3411 3412 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3413 3414 def _create_async_job_status_mapping( 3415 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3416 ) -> Mapping[str, AsyncJobStatus]: 3417 api_status_to_cdk_status = {} 3418 for cdk_status, api_statuses in model.dict().items(): 3419 if cdk_status == "type": 3420 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3421 continue 3422 3423 for status in api_statuses: 3424 if status in api_status_to_cdk_status: 3425 raise ValueError( 3426 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3427 ) 3428 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3429 return api_status_to_cdk_status 3430 3431 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3432 match status: 3433 case "running": 3434 return AsyncJobStatus.RUNNING 3435 case "completed": 3436 return AsyncJobStatus.COMPLETED 3437 case "failed": 3438 return AsyncJobStatus.FAILED 3439 case "timeout": 3440 return AsyncJobStatus.TIMED_OUT 3441 case _: 3442 raise ValueError(f"Unsupported CDK status {status}") 3443 3444 def create_async_retriever( 3445 self, 3446 model: AsyncRetrieverModel, 3447 config: Config, 3448 *, 3449 name: str, 3450 primary_key: Optional[ 3451 Union[str, List[str], List[List[str]]] 3452 ], # this seems to be needed to match create_simple_retriever 3453 stream_slicer: Optional[StreamSlicer], 3454 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3455 transformations: List[RecordTransformation], 3456 **kwargs: Any, 3457 ) -> AsyncRetriever: 3458 def _get_download_retriever() -> SimpleRetriever: 3459 # We create a record selector for the download retriever 3460 # with no schema normalization and no transformations, neither record filter 3461 # as all this occurs in the record_selector of the AsyncRetriever 3462 record_selector = RecordSelector( 3463 extractor=download_extractor, 3464 name=name, 3465 record_filter=None, 3466 transformations=[], 3467 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3468 config=config, 3469 parameters={}, 3470 ) 3471 paginator = ( 3472 self._create_component_from_model( 3473 model=model.download_paginator, 3474 decoder=decoder, 3475 config=config, 3476 url_base="", 3477 ) 3478 if model.download_paginator 3479 else NoPagination(parameters={}) 3480 ) 3481 3482 return SimpleRetriever( 3483 requester=download_requester, 3484 record_selector=record_selector, 3485 primary_key=None, 3486 name=job_download_components_name, 3487 paginator=paginator, 3488 config=config, 3489 parameters={}, 3490 ) 3491 3492 def _get_job_timeout() -> datetime.timedelta: 3493 user_defined_timeout: Optional[int] = ( 3494 int( 3495 InterpolatedString.create( 3496 str(model.polling_job_timeout), 3497 parameters={}, 3498 ).eval(config) 3499 ) 3500 if model.polling_job_timeout 3501 else None 3502 ) 3503 3504 # check for user defined timeout during the test read or 15 minutes 3505 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3506 # default value for non-connector builder is 60 minutes. 
3507 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3508 3509 return ( 3510 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3511 ) 3512 3513 decoder = ( 3514 self._create_component_from_model(model=model.decoder, config=config) 3515 if model.decoder 3516 else JsonDecoder(parameters={}) 3517 ) 3518 record_selector = self._create_component_from_model( 3519 model=model.record_selector, 3520 config=config, 3521 decoder=decoder, 3522 name=name, 3523 transformations=transformations, 3524 client_side_incremental_sync=client_side_incremental_sync, 3525 ) 3526 3527 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3528 if self._should_limit_slices_fetched(): 3529 stream_slicer = cast( 3530 StreamSlicer, 3531 StreamSlicerTestReadDecorator( 3532 wrapped_slicer=stream_slicer, 3533 maximum_number_of_slices=self._limit_slices_fetched or 5, 3534 ), 3535 ) 3536 3537 creation_requester = self._create_component_from_model( 3538 model=model.creation_requester, 3539 decoder=decoder, 3540 config=config, 3541 name=f"job creation - {name}", 3542 ) 3543 polling_requester = self._create_component_from_model( 3544 model=model.polling_requester, 3545 decoder=decoder, 3546 config=config, 3547 name=f"job polling - {name}", 3548 ) 3549 job_download_components_name = f"job download - {name}" 3550 download_decoder = ( 3551 self._create_component_from_model(model=model.download_decoder, config=config) 3552 if model.download_decoder 3553 else JsonDecoder(parameters={}) 3554 ) 3555 download_extractor = ( 3556 self._create_component_from_model( 3557 model=model.download_extractor, 3558 config=config, 3559 decoder=download_decoder, 3560 parameters=model.parameters, 3561 ) 3562 if model.download_extractor 3563 else DpathExtractor( 3564 [], 3565 config=config, 3566 decoder=download_decoder, 3567 parameters=model.parameters or {}, 3568 ) 3569 ) 3570 download_requester = self._create_component_from_model( 3571 model=model.download_requester, 3572 decoder=download_decoder, 3573 config=config, 3574 name=job_download_components_name, 3575 ) 3576 download_retriever = _get_download_retriever() 3577 abort_requester = ( 3578 self._create_component_from_model( 3579 model=model.abort_requester, 3580 decoder=decoder, 3581 config=config, 3582 name=f"job abort - {name}", 3583 ) 3584 if model.abort_requester 3585 else None 3586 ) 3587 delete_requester = ( 3588 self._create_component_from_model( 3589 model=model.delete_requester, 3590 decoder=decoder, 3591 config=config, 3592 name=f"job delete - {name}", 3593 ) 3594 if model.delete_requester 3595 else None 3596 ) 3597 download_target_requester = ( 3598 self._create_component_from_model( 3599 model=model.download_target_requester, 3600 decoder=decoder, 3601 config=config, 3602 name=f"job extract_url - {name}", 3603 ) 3604 if model.download_target_requester 3605 else None 3606 ) 3607 status_extractor = self._create_component_from_model( 3608 model=model.status_extractor, decoder=decoder, config=config, name=name 3609 ) 3610 download_target_extractor = self._create_component_from_model( 3611 model=model.download_target_extractor, 3612 decoder=decoder, 3613 config=config, 3614 name=name, 3615 ) 3616 3617 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3618 creation_requester=creation_requester, 3619 polling_requester=polling_requester, 3620 download_retriever=download_retriever, 3621 download_target_requester=download_target_requester, 3622 abort_requester=abort_requester, 3623 
delete_requester=delete_requester, 3624 status_extractor=status_extractor, 3625 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3626 download_target_extractor=download_target_extractor, 3627 job_timeout=_get_job_timeout(), 3628 ) 3629 3630 async_job_partition_router = AsyncJobPartitionRouter( 3631 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3632 job_repository, 3633 stream_slices, 3634 self._job_tracker, 3635 self._message_repository, 3636 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3637 has_bulk_parent=False, 3638 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3639 # `None` means the default of 3 retry attempts is used under the hood. 3640 job_max_retry=1 if self._emit_connector_builder_messages else None, 3641 ), 3642 stream_slicer=stream_slicer, 3643 config=config, 3644 parameters=model.parameters or {}, 3645 ) 3646 3647 return AsyncRetriever( 3648 record_selector=record_selector, 3649 stream_slicer=async_job_partition_router, 3650 config=config, 3651 parameters=model.parameters or {}, 3652 ) 3653 3654 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3655 config_migrations = [ 3656 self._create_component_from_model(migration, config) 3657 for migration in ( 3658 model.config_normalization_rules.config_migrations 3659 if ( 3660 model.config_normalization_rules 3661 and model.config_normalization_rules.config_migrations 3662 ) 3663 else [] 3664 ) 3665 ] 3666 config_transformations = [ 3667 self._create_component_from_model(transformation, config) 3668 for transformation in ( 3669 model.config_normalization_rules.transformations 3670 if ( 3671 model.config_normalization_rules 3672 and model.config_normalization_rules.transformations 3673 ) 3674 else [] 3675 ) 3676 ] 3677 config_validations = [ 3678 self._create_component_from_model(validation, config) 3679 for validation in ( 3680 model.config_normalization_rules.validations 3681 if ( 3682 model.config_normalization_rules 3683 and model.config_normalization_rules.validations 3684 ) 3685 else [] 3686 ) 3687 ] 3688 3689 return Spec( 3690 connection_specification=model.connection_specification, 3691 documentation_url=model.documentation_url, 3692 advanced_auth=model.advanced_auth, 3693 parameters={}, 3694 config_migrations=config_migrations, 3695 config_transformations=config_transformations, 3696 config_validations=config_validations, 3697 ) 3698 3699 def create_substream_partition_router( 3700 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3701 ) -> SubstreamPartitionRouter: 3702 parent_stream_configs = [] 3703 if model.parent_stream_configs: 3704 parent_stream_configs.extend( 3705 [ 3706 self._create_message_repository_substream_wrapper( 3707 model=parent_stream_config, config=config, **kwargs 3708 ) 3709 for parent_stream_config in model.parent_stream_configs 3710 ] 3711 ) 3712 3713 return SubstreamPartitionRouter( 3714 parent_stream_configs=parent_stream_configs, 3715 parameters=model.parameters or {}, 3716 config=config, 3717 ) 3718 3719 def _create_message_repository_substream_wrapper( 3720 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3721 ) -> Any: 3722 substream_factory = ModelToComponentFactory( 3723 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3724 limit_slices_fetched=self._limit_slices_fetched, 3725 emit_connector_builder_messages=self._emit_connector_builder_messages, 3726
disable_retries=self._disable_retries, 3727 disable_cache=self._disable_cache, 3728 message_repository=LogAppenderMessageRepositoryDecorator( 3729 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3730 self._message_repository, 3731 self._evaluate_log_level(self._emit_connector_builder_messages), 3732 ), 3733 ) 3734 3735 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3736 has_parent_state = bool( 3737 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3738 if model.incremental_dependency 3739 else False 3740 ) 3741 return substream_factory._create_component_from_model( 3742 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3743 ) 3744 3745 @staticmethod 3746 def create_wait_time_from_header( 3747 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3748 ) -> WaitTimeFromHeaderBackoffStrategy: 3749 return WaitTimeFromHeaderBackoffStrategy( 3750 header=model.header, 3751 parameters=model.parameters or {}, 3752 config=config, 3753 regex=model.regex, 3754 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3755 if model.max_waiting_time_in_seconds is not None 3756 else None, 3757 ) 3758 3759 @staticmethod 3760 def create_wait_until_time_from_header( 3761 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3762 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3763 return WaitUntilTimeFromHeaderBackoffStrategy( 3764 header=model.header, 3765 parameters=model.parameters or {}, 3766 config=config, 3767 min_wait=model.min_wait, 3768 regex=model.regex, 3769 ) 3770 3771 def get_message_repository(self) -> MessageRepository: 3772 return self._message_repository 3773 3774 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3775 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3776 3777 @staticmethod 3778 def create_components_mapping_definition( 3779 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3780 ) -> ComponentMappingDefinition: 3781 interpolated_value = InterpolatedString.create( 3782 model.value, parameters=model.parameters or {} 3783 ) 3784 field_path = [ 3785 InterpolatedString.create(path, parameters=model.parameters or {}) 3786 for path in model.field_path 3787 ] 3788 return ComponentMappingDefinition( 3789 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3790 value=interpolated_value, 3791 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3792 create_or_update=model.create_or_update, 3793 parameters=model.parameters or {}, 3794 ) 3795 3796 def create_http_components_resolver( 3797 self, model: HttpComponentsResolverModel, config: Config 3798 ) -> Any: 3799 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3800 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3801 3802 retriever = self._create_component_from_model( 3803 model=model.retriever, 3804 config=config, 3805 name="", 3806 primary_key=None, 3807 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3808 transformations=[], 3809 ) 3810 3811 components_mapping = [ 3812 self._create_component_from_model( 3813 model=components_mapping_definition_model, 3814 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3815 components_mapping_definition_model.value_type 3816 ), 3817 config=config, 3818 ) 3819 for 
components_mapping_definition_model in model.components_mapping 3820 ] 3821 3822 return HttpComponentsResolver( 3823 retriever=retriever, 3824 config=config, 3825 components_mapping=components_mapping, 3826 parameters=model.parameters or {}, 3827 ) 3828 3829 @staticmethod 3830 def create_stream_config( 3831 model: StreamConfigModel, config: Config, **kwargs: Any 3832 ) -> StreamConfig: 3833 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3834 [x for x in model.configs_pointer] if model.configs_pointer else [] 3835 ) 3836 3837 return StreamConfig( 3838 configs_pointer=model_configs_pointer, 3839 default_values=model.default_values, 3840 parameters=model.parameters or {}, 3841 ) 3842 3843 def create_config_components_resolver( 3844 self, model: ConfigComponentsResolverModel, config: Config 3845 ) -> Any: 3846 model_stream_configs = ( 3847 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3848 ) 3849 3850 stream_configs = [ 3851 self._create_component_from_model( 3852 stream_config, config=config, parameters=model.parameters or {} 3853 ) 3854 for stream_config in model_stream_configs 3855 ] 3856 3857 components_mapping = [ 3858 self._create_component_from_model( 3859 model=components_mapping_definition_model, 3860 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3861 components_mapping_definition_model.value_type 3862 ), 3863 config=config, 3864 ) 3865 for components_mapping_definition_model in model.components_mapping 3866 ] 3867 3868 return ConfigComponentsResolver( 3869 stream_configs=stream_configs, 3870 config=config, 3871 components_mapping=components_mapping, 3872 parameters=model.parameters or {}, 3873 ) 3874 3875 def create_parametrized_components_resolver( 3876 self, model: ParametrizedComponentsResolverModel, config: Config 3877 ) -> ParametrizedComponentsResolver: 3878 stream_parameters = StreamParametersDefinition( 3879 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3880 ) 3881 components_mapping = [ 3882 self._create_component_from_model( 3883 model=components_mapping_definition_model, 3884 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3885 components_mapping_definition_model.value_type 3886 ), 3887 config=config, 3888 ) 3889 for components_mapping_definition_model in model.components_mapping 3890 ] 3891 return ParametrizedComponentsResolver( 3892 stream_parameters=stream_parameters, 3893 config=config, 3894 components_mapping=components_mapping, 3895 parameters=model.parameters or {}, 3896 ) 3897 3898 _UNSUPPORTED_DECODER_ERROR = ( 3899 "Specified decoder of {decoder_type} is not supported for pagination." 3900 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3901 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3902 ) 3903 3904 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3905 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3906 return True 3907 elif isinstance(decoder, CompositeRawDecoder): 3908 return self._is_supported_parser_for_pagination(decoder.parser) 3909 else: 3910 return False 3911 3912 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3913 if isinstance(parser, JsonParser): 3914 return True 3915 elif isinstance(parser, GzipParser): 3916 return isinstance(parser.inner_parser, JsonParser) 3917 else: 3918 return False 3919 3920 def create_http_api_budget( 3921 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3922 ) -> HttpAPIBudget: 3923 policies = [ 3924 self._create_component_from_model(model=policy, config=config) 3925 for policy in model.policies 3926 ] 3927 3928 return HttpAPIBudget( 3929 policies=policies, 3930 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3931 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3932 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3933 ) 3934 3935 def create_fixed_window_call_rate_policy( 3936 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3937 ) -> FixedWindowCallRatePolicy: 3938 matchers = [ 3939 self._create_component_from_model(model=matcher, config=config) 3940 for matcher in model.matchers 3941 ] 3942 3943 # Set the initial reset timestamp to 10 days from now. 3944 # This value will be updated by the first request. 3945 return FixedWindowCallRatePolicy( 3946 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3947 period=parse_duration(model.period), 3948 call_limit=model.call_limit, 3949 matchers=matchers, 3950 ) 3951 3952 def create_file_uploader( 3953 self, model: FileUploaderModel, config: Config, **kwargs: Any 3954 ) -> FileUploader: 3955 name = "File Uploader" 3956 requester = self._create_component_from_model( 3957 model=model.requester, 3958 config=config, 3959 name=name, 3960 **kwargs, 3961 ) 3962 download_target_extractor = self._create_component_from_model( 3963 model=model.download_target_extractor, 3964 config=config, 3965 name=name, 3966 **kwargs, 3967 ) 3968 emit_connector_builder_messages = self._emit_connector_builder_messages 3969 file_uploader = DefaultFileUploader( 3970 requester=requester, 3971 download_target_extractor=download_target_extractor, 3972 config=config, 3973 file_writer=NoopFileWriter() 3974 if emit_connector_builder_messages 3975 else LocalFileSystemFileWriter(), 3976 parameters=model.parameters or {}, 3977 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3978 ) 3979 3980 return ( 3981 ConnectorBuilderFileUploader(file_uploader) 3982 if emit_connector_builder_messages 3983 else file_uploader 3984 ) 3985 3986 def create_moving_window_call_rate_policy( 3987 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3988 ) -> MovingWindowCallRatePolicy: 3989 rates = [ 3990 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3991 ] 3992 matchers = [ 3993 self._create_component_from_model(model=matcher, config=config) 3994 for matcher in model.matchers 3995 ] 3996 return MovingWindowCallRatePolicy( 3997 rates=rates, 3998 matchers=matchers, 3999 ) 4000 4001 def create_unlimited_call_rate_policy( 4002 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4003 ) -> UnlimitedCallRatePolicy: 4004 matchers = [ 4005 
self._create_component_from_model(model=matcher, config=config) 4006 for matcher in model.matchers 4007 ] 4008 4009 return UnlimitedCallRatePolicy( 4010 matchers=matchers, 4011 ) 4012 4013 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4014 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4015 return Rate( 4016 limit=int(interpolated_limit.eval(config=config)), 4017 interval=parse_duration(model.interval), 4018 ) 4019 4020 def create_http_request_matcher( 4021 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4022 ) -> HttpRequestRegexMatcher: 4023 return HttpRequestRegexMatcher( 4024 method=model.method, 4025 url_base=model.url_base, 4026 url_path_pattern=model.url_path_pattern, 4027 params=model.params, 4028 headers=model.headers, 4029 ) 4030 4031 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4032 self._api_budget = self.create_component( 4033 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4034 ) 4035 4036 def create_grouping_partition_router( 4037 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4038 ) -> GroupingPartitionRouter: 4039 underlying_router = self._create_component_from_model( 4040 model=model.underlying_partition_router, config=config 4041 ) 4042 if model.group_size < 1: 4043 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4044 4045 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4046 # because they are specific to individual partitions and cannot be aggregated or handled 4047 # when grouping, potentially leading to incorrect API calls. Any request customization 4048 # should be managed at the stream level through the requester's configuration. 4049 if isinstance(underlying_router, SubstreamPartitionRouter): 4050 if any( 4051 parent_config.request_option 4052 for parent_config in underlying_router.parent_stream_configs 4053 ): 4054 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4055 4056 if isinstance(underlying_router, ListPartitionRouter): 4057 if underlying_router.request_option: 4058 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4059 4060 return GroupingPartitionRouter( 4061 group_size=model.group_size, 4062 underlying_partition_router=underlying_router, 4063 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4064 config=config, 4065 )
628 def __init__( 629 self, 630 limit_pages_fetched_per_slice: Optional[int] = None, 631 limit_slices_fetched: Optional[int] = None, 632 emit_connector_builder_messages: bool = False, 633 disable_retries: bool = False, 634 disable_cache: bool = False, 635 disable_resumable_full_refresh: bool = False, 636 message_repository: Optional[MessageRepository] = None, 637 connector_state_manager: Optional[ConnectorStateManager] = None, 638 max_concurrent_async_job_count: Optional[int] = None, 639 ): 640 self._init_mappings() 641 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 642 self._limit_slices_fetched = limit_slices_fetched 643 self._emit_connector_builder_messages = emit_connector_builder_messages 644 self._disable_retries = disable_retries 645 self._disable_cache = disable_cache 646 self._disable_resumable_full_refresh = disable_resumable_full_refresh 647 self._message_repository = message_repository or InMemoryMessageRepository( 648 self._evaluate_log_level(emit_connector_builder_messages) 649 ) 650 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 651 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 652 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 653 # placeholder for deprecation warnings 654 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
768 def create_component( 769 self, 770 model_type: Type[BaseModel], 771 component_definition: ComponentDefinition, 772 config: Config, 773 **kwargs: Any, 774 ) -> Any: 775 """ 776 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 777 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 778 creating declarative components from that model. 779 780 :param model_type: The type of declarative component that is being initialized 781 :param component_definition: The mapping that represents a declarative component 782 :param config: The connector config that is provided by the customer 783 :return: The declarative component to be used at runtime 784 """ 785 786 component_type = component_definition.get("type") 787 if component_definition.get("type") != model_type.__name__: 788 raise ValueError( 789 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 790 ) 791 792 declarative_component_model = model_type.parse_obj(component_definition) 793 794 if not isinstance(declarative_component_model, model_type): 795 raise ValueError( 796 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 797 ) 798 799 return self._create_component_from_model( 800 model=declarative_component_model, config=config, **kwargs 801 )
Takes a given Pydantic model type and a Mapping representing a component definition and creates the declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model. A brief usage sketch follows the parameter list below.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
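As a rough usage sketch (the model alias, manifest snippet, config values, and the models import path are illustrative assumptions, not taken from a real connector), a caller pairs a Pydantic model type with a mapping whose "type" field matches the model's class name:

from airbyte_cdk.sources.declarative.models import HttpRequester as HttpRequesterModel
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# Hypothetical manifest snippet; "type" must equal the model class name ("HttpRequester").
requester_definition = {
    "type": "HttpRequester",
    "url_base": "https://api.example.com",
    "path": "/v1/items",
    "http_method": "GET",
}

requester = factory.create_component(
    model_type=HttpRequesterModel,
    component_definition=requester_definition,
    config={"api_key": "..."},
    name="items",  # extra kwargs are forwarded to the underlying create method
)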
818 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 819 """ 820 Returns the deprecation warnings that were collected during the creation of components. 821 """ 822 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
839 def create_config_migration( 840 self, model: ConfigMigrationModel, config: Config 841 ) -> ConfigMigration: 842 transformations: List[ConfigTransformation] = [ 843 self._create_component_from_model(transformation, config) 844 for transformation in model.transformations 845 ] 846 847 return ConfigMigration( 848 description=model.description, 849 transformations=transformations, 850 )
852 def create_config_add_fields( 853 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 854 ) -> ConfigAddFields: 855 fields = [self._create_component_from_model(field, config) for field in model.fields] 856 return ConfigAddFields( 857 fields=fields, 858 condition=model.condition or "", 859 )
908 @staticmethod 909 def create_added_field_definition( 910 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 911 ) -> AddedFieldDefinition: 912 interpolated_value = InterpolatedString.create( 913 model.value, parameters=model.parameters or {} 914 ) 915 return AddedFieldDefinition( 916 path=model.path, 917 value=interpolated_value, 918 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 919 parameters=model.parameters or {}, 920 )
922 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 923 added_field_definitions = [ 924 self._create_component_from_model( 925 model=added_field_definition_model, 926 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 927 added_field_definition_model.value_type 928 ), 929 config=config, 930 ) 931 for added_field_definition_model in model.fields 932 ] 933 return AddFields( 934 fields=added_field_definitions, 935 condition=model.condition or "", 936 parameters=model.parameters or {}, 937 )
963 def create_dpath_flatten_fields( 964 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 965 ) -> DpathFlattenFields: 966 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 967 key_transformation = ( 968 KeyTransformation( 969 config=config, 970 prefix=model.key_transformation.prefix, 971 suffix=model.key_transformation.suffix, 972 parameters=model.parameters or {}, 973 ) 974 if model.key_transformation is not None 975 else None 976 ) 977 return DpathFlattenFields( 978 config=config, 979 field_path=model_field_path, 980 delete_origin_value=model.delete_origin_value 981 if model.delete_origin_value is not None 982 else False, 983 replace_record=model.replace_record if model.replace_record is not None else False, 984 key_transformation=key_transformation, 985 parameters=model.parameters or {}, 986 )
1000 def create_api_key_authenticator( 1001 self, 1002 model: ApiKeyAuthenticatorModel, 1003 config: Config, 1004 token_provider: Optional[TokenProvider] = None, 1005 **kwargs: Any, 1006 ) -> ApiKeyAuthenticator: 1007 if model.inject_into is None and model.header is None: 1008 raise ValueError( 1009 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1010 ) 1011 1012 if model.inject_into is not None and model.header is not None: 1013 raise ValueError( 1014 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1015 ) 1016 1017 if token_provider is not None and model.api_token != "": 1018 raise ValueError( 1019 "If token_provider is set, api_token is ignored and has to be set to empty string." 1020 ) 1021 1022 request_option = ( 1023 self._create_component_from_model( 1024 model.inject_into, config, parameters=model.parameters or {} 1025 ) 1026 if model.inject_into 1027 else RequestOption( 1028 inject_into=RequestOptionType.header, 1029 field_name=model.header or "", 1030 parameters=model.parameters or {}, 1031 ) 1032 ) 1033 1034 return ApiKeyAuthenticator( 1035 token_provider=( 1036 token_provider 1037 if token_provider is not None 1038 else InterpolatedStringTokenProvider( 1039 api_token=model.api_token or "", 1040 config=config, 1041 parameters=model.parameters or {}, 1042 ) 1043 ), 1044 request_option=request_option, 1045 config=config, 1046 parameters=model.parameters or {}, 1047 )
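For illustration (the header name and interpolation below are invented), a definition that satisfies the checks above sets inject_into and leaves the deprecated header field unset; api_token is left as an empty string only when an external token_provider is passed in:

# Hypothetical ApiKeyAuthenticator definition: exactly one of inject_into / header is used.
api_key_definition = {
    "type": "ApiKeyAuthenticator",
    "api_token": "{{ config['api_key'] }}",
    "inject_into": {
        "type": "RequestOption",
        "inject_into": "header",
        "field_name": "X-Api-Key",
    },
}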
1049 def create_legacy_to_per_partition_state_migration( 1050 self, 1051 model: LegacyToPerPartitionStateMigrationModel, 1052 config: Mapping[str, Any], 1053 declarative_stream: DeclarativeStreamModel, 1054 ) -> LegacyToPerPartitionStateMigration: 1055 retriever = declarative_stream.retriever 1056 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1057 raise ValueError( 1058 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1059 ) 1060 partition_router = retriever.partition_router 1061 if not isinstance( 1062 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1063 ): 1064 raise ValueError( 1065 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1066 ) 1067 if not hasattr(partition_router, "parent_stream_configs"): 1068 raise ValueError( 1069 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1070 ) 1071 1072 if not hasattr(declarative_stream, "incremental_sync"): 1073 raise ValueError( 1074 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1075 ) 1076 1077 return LegacyToPerPartitionStateMigration( 1078 partition_router, # type: ignore # was already checked above 1079 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1080 config, 1081 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1082 )
1084 def create_session_token_authenticator( 1085 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1086 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1087 decoder = ( 1088 self._create_component_from_model(model=model.decoder, config=config) 1089 if model.decoder 1090 else JsonDecoder(parameters={}) 1091 ) 1092 login_requester = self._create_component_from_model( 1093 model=model.login_requester, 1094 config=config, 1095 name=f"{name}_login_requester", 1096 decoder=decoder, 1097 ) 1098 token_provider = SessionTokenProvider( 1099 login_requester=login_requester, 1100 session_token_path=model.session_token_path, 1101 expiration_duration=parse_duration(model.expiration_duration) 1102 if model.expiration_duration 1103 else None, 1104 parameters=model.parameters or {}, 1105 message_repository=self._message_repository, 1106 decoder=decoder, 1107 ) 1108 if model.request_authentication.type == "Bearer": 1109 return ModelToComponentFactory.create_bearer_authenticator( 1110 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1111 config, 1112 token_provider=token_provider, 1113 ) 1114 else: 1115 return self.create_api_key_authenticator( 1116 ApiKeyAuthenticatorModel( 1117 type="ApiKeyAuthenticator", 1118 api_token="", 1119 inject_into=model.request_authentication.inject_into, 1120 ), # type: ignore # $parameters and headers default to None 1121 config=config, 1122 token_provider=token_provider, 1123 )
1125 @staticmethod 1126 def create_basic_http_authenticator( 1127 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1128 ) -> BasicHttpAuthenticator: 1129 return BasicHttpAuthenticator( 1130 password=model.password or "", 1131 username=model.username, 1132 config=config, 1133 parameters=model.parameters or {}, 1134 )
1136 @staticmethod 1137 def create_bearer_authenticator( 1138 model: BearerAuthenticatorModel, 1139 config: Config, 1140 token_provider: Optional[TokenProvider] = None, 1141 **kwargs: Any, 1142 ) -> BearerAuthenticator: 1143 if token_provider is not None and model.api_token != "": 1144 raise ValueError( 1145 "If token_provider is set, api_token is ignored and has to be set to empty string." 1146 ) 1147 return BearerAuthenticator( 1148 token_provider=( 1149 token_provider 1150 if token_provider is not None 1151 else InterpolatedStringTokenProvider( 1152 api_token=model.api_token or "", 1153 config=config, 1154 parameters=model.parameters or {}, 1155 ) 1156 ), 1157 config=config, 1158 parameters=model.parameters or {}, 1159 )
1161 @staticmethod 1162 def create_dynamic_stream_check_config( 1163 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1164 ) -> DynamicStreamCheckConfig: 1165 return DynamicStreamCheckConfig( 1166 dynamic_stream_name=model.dynamic_stream_name, 1167 stream_count=model.stream_count or 0, 1168 )
1170 def create_check_stream( 1171 self, model: CheckStreamModel, config: Config, **kwargs: Any 1172 ) -> CheckStream: 1173 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1174 raise ValueError( 1175 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1176 ) 1177 1178 dynamic_streams_check_configs = ( 1179 [ 1180 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1181 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1182 ] 1183 if model.dynamic_streams_check_configs 1184 else [] 1185 ) 1186 1187 return CheckStream( 1188 stream_names=model.stream_names or [], 1189 dynamic_streams_check_configs=dynamic_streams_check_configs, 1190 parameters={}, 1191 )
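As a small illustration (stream names and counts are invented), CheckStream requires at least one of stream_names or dynamic_streams_check_configs to be present:

# Hypothetical CheckStream definition with static stream names ...
check_definition = {"type": "CheckStream", "stream_names": ["items"]}

# ... or, alternatively, with dynamic stream check configs.
dynamic_check_definition = {
    "type": "CheckStream",
    "dynamic_streams_check_configs": [
        {"type": "DynamicStreamCheckConfig", "dynamic_stream_name": "tables", "stream_count": 1}
    ],
}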
1193 @staticmethod 1194 def create_check_dynamic_stream( 1195 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1196 ) -> CheckDynamicStream: 1197 assert model.use_check_availability is not None # for mypy 1198 1199 use_check_availability = model.use_check_availability 1200 1201 return CheckDynamicStream( 1202 stream_count=model.stream_count, 1203 use_check_availability=use_check_availability, 1204 parameters={}, 1205 )
1207 def create_composite_error_handler( 1208 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1209 ) -> CompositeErrorHandler: 1210 error_handlers = [ 1211 self._create_component_from_model(model=error_handler_model, config=config) 1212 for error_handler_model in model.error_handlers 1213 ] 1214 return CompositeErrorHandler( 1215 error_handlers=error_handlers, parameters=model.parameters or {} 1216 )
1218 @staticmethod 1219 def create_concurrency_level( 1220 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1221 ) -> ConcurrencyLevel: 1222 return ConcurrencyLevel( 1223 default_concurrency=model.default_concurrency, 1224 max_concurrency=model.max_concurrency, 1225 config=config, 1226 parameters={}, 1227 )
1229 @staticmethod 1230 def apply_stream_state_migrations( 1231 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1232 ) -> MutableMapping[str, Any]: 1233 if stream_state_migrations: 1234 for state_migration in stream_state_migrations: 1235 if state_migration.should_migrate(stream_state): 1236 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1237 stream_state = dict(state_migration.migrate(stream_state)) 1238 return stream_state
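A minimal sketch of an object this loop can consume; the class and key names are invented, but the should_migrate/migrate pair mirrors what the factory expects:

from typing import Any, Mapping, MutableMapping

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)


class RenameLegacyCursorKey:
    """Illustrative migration: moves a legacy cursor key to the current one."""

    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        return "last_synced" in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        # May return an immutable mapping; apply_stream_state_migrations copies it into a dict.
        return {"updated_at": stream_state["last_synced"]}


state: MutableMapping[str, Any] = {"last_synced": "2024-01-01T00:00:00Z"}
migrated = ModelToComponentFactory.apply_stream_state_migrations([RenameLegacyCursorKey()], state)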
1240 def create_concurrent_cursor_from_datetime_based_cursor( 1241 self, 1242 model_type: Type[BaseModel], 1243 component_definition: ComponentDefinition, 1244 stream_name: str, 1245 stream_namespace: Optional[str], 1246 config: Config, 1247 message_repository: Optional[MessageRepository] = None, 1248 runtime_lookback_window: Optional[datetime.timedelta] = None, 1249 stream_state_migrations: Optional[List[Any]] = None, 1250 **kwargs: Any, 1251 ) -> ConcurrentCursor: 1252 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1253 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1254 # incoming state and connector_state_manager that is initialized when the component factory is created 1255 stream_state = ( 1256 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1257 if "stream_state" not in kwargs 1258 else kwargs["stream_state"] 1259 ) 1260 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1261 1262 component_type = component_definition.get("type") 1263 if component_definition.get("type") != model_type.__name__: 1264 raise ValueError( 1265 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1266 ) 1267 1268 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1269 1270 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1271 raise ValueError( 1272 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1273 ) 1274 1275 interpolated_cursor_field = InterpolatedString.create( 1276 datetime_based_cursor_model.cursor_field, 1277 parameters=datetime_based_cursor_model.parameters or {}, 1278 ) 1279 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1280 1281 interpolated_partition_field_start = InterpolatedString.create( 1282 datetime_based_cursor_model.partition_field_start or "start_time", 1283 parameters=datetime_based_cursor_model.parameters or {}, 1284 ) 1285 interpolated_partition_field_end = InterpolatedString.create( 1286 datetime_based_cursor_model.partition_field_end or "end_time", 1287 parameters=datetime_based_cursor_model.parameters or {}, 1288 ) 1289 1290 slice_boundary_fields = ( 1291 interpolated_partition_field_start.eval(config=config), 1292 interpolated_partition_field_end.eval(config=config), 1293 ) 1294 1295 datetime_format = datetime_based_cursor_model.datetime_format 1296 1297 cursor_granularity = ( 1298 parse_duration(datetime_based_cursor_model.cursor_granularity) 1299 if datetime_based_cursor_model.cursor_granularity 1300 else None 1301 ) 1302 1303 lookback_window = None 1304 interpolated_lookback_window = ( 1305 InterpolatedString.create( 1306 datetime_based_cursor_model.lookback_window, 1307 parameters=datetime_based_cursor_model.parameters or {}, 1308 ) 1309 if datetime_based_cursor_model.lookback_window 1310 else None 1311 ) 1312 if interpolated_lookback_window: 1313 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1314 if evaluated_lookback_window: 1315 lookback_window = parse_duration(evaluated_lookback_window) 1316 1317 connector_state_converter: DateTimeStreamStateConverter 1318 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1319 datetime_format=datetime_format, 1320 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1321 is_sequential_state=True, # 
ConcurrentPerPartitionCursor only works with sequential state 1322 cursor_granularity=cursor_granularity, 1323 ) 1324 1325 # Adjusts the stream state by applying the runtime lookback window. 1326 # This is used to ensure correct state handling in case of failed partitions. 1327 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1328 if runtime_lookback_window and stream_state_value: 1329 new_stream_state = ( 1330 connector_state_converter.parse_timestamp(stream_state_value) 1331 - runtime_lookback_window 1332 ) 1333 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1334 new_stream_state 1335 ) 1336 1337 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1338 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1339 start_date_runtime_value = self.create_min_max_datetime( 1340 model=datetime_based_cursor_model.start_datetime, config=config 1341 ) 1342 else: 1343 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1344 1345 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1346 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1347 end_date_runtime_value = self.create_min_max_datetime( 1348 model=datetime_based_cursor_model.end_datetime, config=config 1349 ) 1350 else: 1351 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1352 1353 interpolated_start_date = MinMaxDatetime.create( 1354 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1355 parameters=datetime_based_cursor_model.parameters, 1356 ) 1357 interpolated_end_date = ( 1358 None 1359 if not end_date_runtime_value 1360 else MinMaxDatetime.create( 1361 end_date_runtime_value, datetime_based_cursor_model.parameters 1362 ) 1363 ) 1364 1365 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1366 if not interpolated_start_date.datetime_format: 1367 interpolated_start_date.datetime_format = datetime_format 1368 if interpolated_end_date and not interpolated_end_date.datetime_format: 1369 interpolated_end_date.datetime_format = datetime_format 1370 1371 start_date = interpolated_start_date.get_datetime(config=config) 1372 end_date_provider = ( 1373 partial(interpolated_end_date.get_datetime, config) 1374 if interpolated_end_date 1375 else connector_state_converter.get_end_provider() 1376 ) 1377 1378 if ( 1379 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1380 ) or ( 1381 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1382 ): 1383 raise ValueError( 1384 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1385 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1386 ) 1387 1388 # When step is not defined, default to a step size from the starting date to the present moment 1389 step_length = datetime.timedelta.max 1390 interpolated_step = ( 1391 InterpolatedString.create( 1392 datetime_based_cursor_model.step, 1393 parameters=datetime_based_cursor_model.parameters or {}, 1394 ) 1395 if datetime_based_cursor_model.step 1396 else None 1397 ) 1398 if interpolated_step: 1399 evaluated_step = interpolated_step.eval(config) 1400 if evaluated_step: 1401 step_length = parse_duration(evaluated_step) 1402 1403 clamping_strategy: ClampingStrategy = NoClamping() 1404 if datetime_based_cursor_model.clamping: 1405 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1406 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1407 # object which we want to keep agnostic of being low-code 1408 target = InterpolatedString( 1409 string=datetime_based_cursor_model.clamping.target, 1410 parameters=datetime_based_cursor_model.parameters or {}, 1411 ) 1412 evaluated_target = target.eval(config=config) 1413 match evaluated_target: 1414 case "DAY": 1415 clamping_strategy = DayClampingStrategy() 1416 end_date_provider = ClampingEndProvider( 1417 DayClampingStrategy(is_ceiling=False), 1418 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1419 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1420 ) 1421 case "WEEK": 1422 if ( 1423 not datetime_based_cursor_model.clamping.target_details 1424 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1425 ): 1426 raise ValueError( 1427 "Given WEEK clamping, weekday needs to be provided as target_details" 1428 ) 1429 weekday = self._assemble_weekday( 1430 datetime_based_cursor_model.clamping.target_details["weekday"] 1431 ) 1432 clamping_strategy = WeekClampingStrategy(weekday) 1433 end_date_provider = ClampingEndProvider( 1434 WeekClampingStrategy(weekday, is_ceiling=False), 1435 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1436 granularity=cursor_granularity or datetime.timedelta(days=1), 1437 ) 1438 case "MONTH": 1439 clamping_strategy = MonthClampingStrategy() 1440 end_date_provider = ClampingEndProvider( 1441 MonthClampingStrategy(is_ceiling=False), 1442 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1443 granularity=cursor_granularity or datetime.timedelta(days=1), 1444 ) 1445 case _: 1446 raise ValueError( 1447 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1448 ) 1449 1450 return ConcurrentCursor( 1451 stream_name=stream_name, 1452 stream_namespace=stream_namespace, 1453 stream_state=stream_state, 1454 message_repository=message_repository or self._message_repository, 1455 connector_state_manager=self._connector_state_manager, 1456 connector_state_converter=connector_state_converter, 1457 cursor_field=cursor_field, 1458 slice_boundary_fields=slice_boundary_fields, 1459 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1460 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1461 lookback_window=lookback_window, 1462 slice_range=step_length, 1463 cursor_granularity=cursor_granularity, 1464 clamping_strategy=clamping_strategy, 1465 )
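For illustration only (the stream name, field names, dates, models import path, and weekday value are assumptions), a DatetimeBasedCursor definition consistent with the constraints enforced above (step and cursor_granularity supplied together, and a weekday provided when clamping to WEEK) could look like:

from airbyte_cdk.sources.declarative.models import (
    DatetimeBasedCursor as DatetimeBasedCursorModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

cursor_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "{{ config['start_date'] }}",
    "step": "P1W",  # when step is set, cursor_granularity must be set too
    "cursor_granularity": "PT1S",
    # Clamping target and weekday value are assumptions made for this example.
    "clamping": {"target": "WEEK", "target_details": {"weekday": "MONDAY"}},
}

cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
    model_type=DatetimeBasedCursorModel,
    component_definition=cursor_definition,
    stream_name="items",
    stream_namespace=None,
    config={"start_date": "2024-01-01T00:00:00Z"},
)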
1467 def create_concurrent_cursor_from_incrementing_count_cursor( 1468 self, 1469 model_type: Type[BaseModel], 1470 component_definition: ComponentDefinition, 1471 stream_name: str, 1472 stream_namespace: Optional[str], 1473 config: Config, 1474 message_repository: Optional[MessageRepository] = None, 1475 **kwargs: Any, 1476 ) -> ConcurrentCursor: 1477 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1478 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1479 # incoming state and connector_state_manager that is initialized when the component factory is created 1480 stream_state = ( 1481 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1482 if "stream_state" not in kwargs 1483 else kwargs["stream_state"] 1484 ) 1485 1486 component_type = component_definition.get("type") 1487 if component_definition.get("type") != model_type.__name__: 1488 raise ValueError( 1489 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1490 ) 1491 1492 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1493 1494 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1495 raise ValueError( 1496 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1497 ) 1498 1499 interpolated_start_value = ( 1500 InterpolatedString.create( 1501 incrementing_count_cursor_model.start_value, # type: ignore 1502 parameters=incrementing_count_cursor_model.parameters or {}, 1503 ) 1504 if incrementing_count_cursor_model.start_value 1505 else 0 1506 ) 1507 1508 interpolated_cursor_field = InterpolatedString.create( 1509 incrementing_count_cursor_model.cursor_field, 1510 parameters=incrementing_count_cursor_model.parameters or {}, 1511 ) 1512 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1513 1514 connector_state_converter = IncrementingCountStreamStateConverter( 1515 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1516 ) 1517 1518 return ConcurrentCursor( 1519 stream_name=stream_name, 1520 stream_namespace=stream_namespace, 1521 stream_state=stream_state, 1522 message_repository=message_repository or self._message_repository, 1523 connector_state_manager=self._connector_state_manager, 1524 connector_state_converter=connector_state_converter, 1525 cursor_field=cursor_field, 1526 slice_boundary_fields=None, 1527 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1528 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1529 )
1550 def create_concurrent_cursor_from_perpartition_cursor( 1551 self, 1552 state_manager: ConnectorStateManager, 1553 model_type: Type[BaseModel], 1554 component_definition: ComponentDefinition, 1555 stream_name: str, 1556 stream_namespace: Optional[str], 1557 config: Config, 1558 stream_state: MutableMapping[str, Any], 1559 partition_router: PartitionRouter, 1560 stream_state_migrations: Optional[List[Any]] = None, 1561 **kwargs: Any, 1562 ) -> ConcurrentPerPartitionCursor: 1563 component_type = component_definition.get("type") 1564 if component_definition.get("type") != model_type.__name__: 1565 raise ValueError( 1566 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1567 ) 1568 1569 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1570 1571 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1572 raise ValueError( 1573 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1574 ) 1575 1576 interpolated_cursor_field = InterpolatedString.create( 1577 datetime_based_cursor_model.cursor_field, 1578 parameters=datetime_based_cursor_model.parameters or {}, 1579 ) 1580 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1581 1582 datetime_format = datetime_based_cursor_model.datetime_format 1583 1584 cursor_granularity = ( 1585 parse_duration(datetime_based_cursor_model.cursor_granularity) 1586 if datetime_based_cursor_model.cursor_granularity 1587 else None 1588 ) 1589 1590 connector_state_converter: DateTimeStreamStateConverter 1591 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1592 datetime_format=datetime_format, 1593 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1594 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1595 cursor_granularity=cursor_granularity, 1596 ) 1597 1598 # Create the cursor factory 1599 cursor_factory = ConcurrentCursorFactory( 1600 partial( 1601 self.create_concurrent_cursor_from_datetime_based_cursor, 1602 state_manager=state_manager, 1603 model_type=model_type, 1604 component_definition=component_definition, 1605 stream_name=stream_name, 1606 stream_namespace=stream_namespace, 1607 config=config, 1608 message_repository=NoopMessageRepository(), 1609 stream_state_migrations=stream_state_migrations, 1610 ) 1611 ) 1612 1613 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1614 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1615 use_global_cursor = isinstance( 1616 partition_router, GroupingPartitionRouter 1617 ) or component_definition.get("global_substream_cursor", False) 1618 1619 # Return the concurrent cursor and state converter 1620 return ConcurrentPerPartitionCursor( 1621 cursor_factory=cursor_factory, 1622 partition_router=partition_router, 1623 stream_name=stream_name, 1624 stream_namespace=stream_namespace, 1625 stream_state=stream_state, 1626 message_repository=self._message_repository, # type: ignore 1627 connector_state_manager=state_manager, 1628 connector_state_converter=connector_state_converter, 1629 cursor_field=cursor_field, 1630 use_global_cursor=use_global_cursor, 1631 )
1633 @staticmethod 1634 def create_constant_backoff_strategy( 1635 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1636 ) -> ConstantBackoffStrategy: 1637 return ConstantBackoffStrategy( 1638 backoff_time_in_seconds=model.backoff_time_in_seconds, 1639 config=config, 1640 parameters=model.parameters or {}, 1641 )
1643 def create_cursor_pagination( 1644 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1645 ) -> CursorPaginationStrategy: 1646 if isinstance(decoder, PaginationDecoderDecorator): 1647 inner_decoder = decoder.decoder 1648 else: 1649 inner_decoder = decoder 1650 decoder = PaginationDecoderDecorator(decoder=decoder) 1651 1652 if self._is_supported_decoder_for_pagination(inner_decoder): 1653 decoder_to_use = decoder 1654 else: 1655 raise ValueError( 1656 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1657 ) 1658 1659 return CursorPaginationStrategy( 1660 cursor_value=model.cursor_value, 1661 decoder=decoder_to_use, 1662 page_size=model.page_size, 1663 stop_condition=model.stop_condition, 1664 config=config, 1665 parameters=model.parameters or {}, 1666 )
1668 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1669 """ 1670 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1671 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1672 :param model: The Pydantic model of the custom component being created 1673 :param config: The custom defined connector config 1674 :return: The declarative component built from the Pydantic model to be used at runtime 1675 """ 1676 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1677 component_fields = get_type_hints(custom_component_class) 1678 model_args = model.dict() 1679 model_args["config"] = config 1680 1681 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1682 # we defer to these arguments over the component's definition 1683 for key, arg in kwargs.items(): 1684 model_args[key] = arg 1685 1686 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1687 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1688 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1689 for model_field, model_value in model_args.items(): 1690 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1691 if ( 1692 isinstance(model_value, dict) 1693 and "type" not in model_value 1694 and model_field in component_fields 1695 ): 1696 derived_type = self._derive_component_type_from_type_hints( 1697 component_fields.get(model_field) 1698 ) 1699 if derived_type: 1700 model_value["type"] = derived_type 1701 1702 if self._is_component(model_value): 1703 model_args[model_field] = self._create_nested_component( 1704 model, model_field, model_value, config 1705 ) 1706 elif isinstance(model_value, list): 1707 vals = [] 1708 for v in model_value: 1709 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1710 derived_type = self._derive_component_type_from_type_hints( 1711 component_fields.get(model_field) 1712 ) 1713 if derived_type: 1714 v["type"] = derived_type 1715 if self._is_component(v): 1716 vals.append(self._create_nested_component(model, model_field, v, config)) 1717 else: 1718 vals.append(v) 1719 model_args[model_field] = vals 1720 1721 kwargs = { 1722 class_field: model_args[class_field] 1723 for class_field in component_fields.keys() 1724 if class_field in model_args 1725 } 1726 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor. A hypothetical sketch follows the parameter list below.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
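A sketch of how this plays out; the class path, constructor fields, and manifest values below are hypothetical, and the factory would only forward the fields that appear in the class's type hints:

from dataclasses import dataclass
from typing import Any, Mapping


@dataclass
class MyRecordFilter:
    # Only fields declared here (via type hints) would be passed by the factory.
    config: Mapping[str, Any]
    condition: str


# Hypothetical manifest snippet; class_name points at the connector's own module.
custom_definition = {
    "type": "CustomRecordFilter",
    "class_name": "source_example.components.MyRecordFilter",
    "condition": "{{ record['status'] == 'active' }}",
    "unknown_field": "dropped, since MyRecordFilter declares no such attribute",
}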
1858 def create_datetime_based_cursor( 1859 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1860 ) -> DatetimeBasedCursor: 1861 start_datetime: Union[str, MinMaxDatetime] = ( 1862 model.start_datetime 1863 if isinstance(model.start_datetime, str) 1864 else self.create_min_max_datetime(model.start_datetime, config) 1865 ) 1866 end_datetime: Union[str, MinMaxDatetime, None] = None 1867 if model.is_data_feed and model.end_datetime: 1868 raise ValueError("Data feed does not support end_datetime") 1869 if model.is_data_feed and model.is_client_side_incremental: 1870 raise ValueError( 1871 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1872 ) 1873 if model.end_datetime: 1874 end_datetime = ( 1875 model.end_datetime 1876 if isinstance(model.end_datetime, str) 1877 else self.create_min_max_datetime(model.end_datetime, config) 1878 ) 1879 1880 end_time_option = ( 1881 self._create_component_from_model( 1882 model.end_time_option, config, parameters=model.parameters or {} 1883 ) 1884 if model.end_time_option 1885 else None 1886 ) 1887 start_time_option = ( 1888 self._create_component_from_model( 1889 model.start_time_option, config, parameters=model.parameters or {} 1890 ) 1891 if model.start_time_option 1892 else None 1893 ) 1894 1895 return DatetimeBasedCursor( 1896 cursor_field=model.cursor_field, 1897 cursor_datetime_formats=model.cursor_datetime_formats 1898 if model.cursor_datetime_formats 1899 else [], 1900 cursor_granularity=model.cursor_granularity, 1901 datetime_format=model.datetime_format, 1902 end_datetime=end_datetime, 1903 start_datetime=start_datetime, 1904 step=model.step, 1905 end_time_option=end_time_option, 1906 lookback_window=model.lookback_window, 1907 start_time_option=start_time_option, 1908 partition_field_end=model.partition_field_end, 1909 partition_field_start=model.partition_field_start, 1910 message_repository=self._message_repository, 1911 is_compare_strictly=model.is_compare_strictly, 1912 config=config, 1913 parameters=model.parameters or {}, 1914 )
1916 def create_declarative_stream( 1917 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1918 ) -> DeclarativeStream: 1919 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1920 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1921 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1922 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1923 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1924 1925 primary_key = model.primary_key.__root__ if model.primary_key else None 1926 stop_condition_on_cursor = ( 1927 model.incremental_sync 1928 and hasattr(model.incremental_sync, "is_data_feed") 1929 and model.incremental_sync.is_data_feed 1930 ) 1931 client_side_incremental_sync = None 1932 if ( 1933 model.incremental_sync 1934 and hasattr(model.incremental_sync, "is_client_side_incremental") 1935 and model.incremental_sync.is_client_side_incremental 1936 ): 1937 supported_slicers = ( 1938 DatetimeBasedCursor, 1939 GlobalSubstreamCursor, 1940 PerPartitionWithGlobalCursor, 1941 ) 1942 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1943 raise ValueError( 1944 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead" 1945 ) 1946 cursor = ( 1947 combined_slicers 1948 if isinstance( 1949 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1950 ) 1951 else self._create_component_from_model(model=model.incremental_sync, config=config) 1952 ) 1953 1954 client_side_incremental_sync = {"cursor": cursor} 1955 1956 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1957 cursor_model = model.incremental_sync 1958 1959 end_time_option = ( 1960 self._create_component_from_model( 1961 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1962 ) 1963 if cursor_model.end_time_option 1964 else None 1965 ) 1966 start_time_option = ( 1967 self._create_component_from_model( 1968 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1969 ) 1970 if cursor_model.start_time_option 1971 else None 1972 ) 1973 1974 request_options_provider = DatetimeBasedRequestOptionsProvider( 1975 start_time_option=start_time_option, 1976 end_time_option=end_time_option, 1977 partition_field_start=cursor_model.partition_field_end, 1978 partition_field_end=cursor_model.partition_field_end, 1979 config=config, 1980 parameters=model.parameters or {}, 1981 ) 1982 elif model.incremental_sync and isinstance( 1983 model.incremental_sync, IncrementingCountCursorModel 1984 ): 1985 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1986 1987 start_time_option = ( 1988 self._create_component_from_model( 1989 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1990 config, 1991 parameters=cursor_model.parameters or {}, 1992 ) 1993 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1994 else None 1995 ) 1996 1997 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1998 # the default DatetimeBasedRequestOptionsProvider() sets them to 
start_time/end_time 1999 partition_field_start = "start" 2000 2001 request_options_provider = DatetimeBasedRequestOptionsProvider( 2002 start_time_option=start_time_option, 2003 partition_field_start=partition_field_start, 2004 config=config, 2005 parameters=model.parameters or {}, 2006 ) 2007 else: 2008 request_options_provider = None 2009 2010 transformations = [] 2011 if model.transformations: 2012 for transformation_model in model.transformations: 2013 transformations.append( 2014 self._create_component_from_model(model=transformation_model, config=config) 2015 ) 2016 file_uploader = None 2017 if model.file_uploader: 2018 file_uploader = self._create_component_from_model( 2019 model=model.file_uploader, config=config 2020 ) 2021 2022 retriever = self._create_component_from_model( 2023 model=model.retriever, 2024 config=config, 2025 name=model.name, 2026 primary_key=primary_key, 2027 stream_slicer=combined_slicers, 2028 request_options_provider=request_options_provider, 2029 stop_condition_on_cursor=stop_condition_on_cursor, 2030 client_side_incremental_sync=client_side_incremental_sync, 2031 transformations=transformations, 2032 file_uploader=file_uploader, 2033 incremental_sync=model.incremental_sync, 2034 ) 2035 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2036 2037 if model.state_migrations: 2038 state_transformations = [ 2039 self._create_component_from_model(state_migration, config, declarative_stream=model) 2040 for state_migration in model.state_migrations 2041 ] 2042 else: 2043 state_transformations = [] 2044 2045 schema_loader: Union[ 2046 CompositeSchemaLoader, 2047 DefaultSchemaLoader, 2048 DynamicSchemaLoader, 2049 InlineSchemaLoader, 2050 JsonFileSchemaLoader, 2051 ] 2052 if model.schema_loader and isinstance(model.schema_loader, list): 2053 nested_schema_loaders = [ 2054 self._create_component_from_model(model=nested_schema_loader, config=config) 2055 for nested_schema_loader in model.schema_loader 2056 ] 2057 schema_loader = CompositeSchemaLoader( 2058 schema_loaders=nested_schema_loaders, parameters={} 2059 ) 2060 elif model.schema_loader: 2061 schema_loader = self._create_component_from_model( 2062 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2063 config=config, 2064 ) 2065 else: 2066 options = model.parameters or {} 2067 if "name" not in options: 2068 options["name"] = model.name 2069 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2070 2071 return DeclarativeStream( 2072 name=model.name or "", 2073 primary_key=primary_key, 2074 retriever=retriever, 2075 schema_loader=schema_loader, 2076 stream_cursor_field=cursor_field or "", 2077 state_migrations=state_transformations, 2078 config=config, 2079 parameters=model.parameters or {}, 2080 )
2249 def create_default_error_handler( 2250 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2251 ) -> DefaultErrorHandler: 2252 backoff_strategies = [] 2253 if model.backoff_strategies: 2254 for backoff_strategy_model in model.backoff_strategies: 2255 backoff_strategies.append( 2256 self._create_component_from_model(model=backoff_strategy_model, config=config) 2257 ) 2258 2259 response_filters = [] 2260 if model.response_filters: 2261 for response_filter_model in model.response_filters: 2262 response_filters.append( 2263 self._create_component_from_model(model=response_filter_model, config=config) 2264 ) 2265 response_filters.append( 2266 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2267 ) 2268 2269 return DefaultErrorHandler( 2270 backoff_strategies=backoff_strategies, 2271 max_retries=model.max_retries, 2272 response_filters=response_filters, 2273 config=config, 2274 parameters=model.parameters or {}, 2275 )
2277 def create_default_paginator( 2278 self, 2279 model: DefaultPaginatorModel, 2280 config: Config, 2281 *, 2282 url_base: str, 2283 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2284 decoder: Optional[Decoder] = None, 2285 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2286 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2287 if decoder: 2288 if self._is_supported_decoder_for_pagination(decoder): 2289 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2290 else: 2291 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2292 else: 2293 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2294 page_size_option = ( 2295 self._create_component_from_model(model=model.page_size_option, config=config) 2296 if model.page_size_option 2297 else None 2298 ) 2299 page_token_option = ( 2300 self._create_component_from_model(model=model.page_token_option, config=config) 2301 if model.page_token_option 2302 else None 2303 ) 2304 pagination_strategy = self._create_component_from_model( 2305 model=model.pagination_strategy, 2306 config=config, 2307 decoder=decoder_to_use, 2308 extractor_model=extractor_model, 2309 ) 2310 if cursor_used_for_stop_condition: 2311 pagination_strategy = StopConditionPaginationStrategyDecorator( 2312 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2313 ) 2314 paginator = DefaultPaginator( 2315 decoder=decoder_to_use, 2316 page_size_option=page_size_option, 2317 page_token_option=page_token_option, 2318 pagination_strategy=pagination_strategy, 2319 url_base=url_base, 2320 config=config, 2321 parameters=model.parameters or {}, 2322 ) 2323 if self._limit_pages_fetched_per_slice: 2324 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2325 return paginator
2327 def create_dpath_extractor( 2328 self, 2329 model: DpathExtractorModel, 2330 config: Config, 2331 decoder: Optional[Decoder] = None, 2332 **kwargs: Any, 2333 ) -> DpathExtractor: 2334 if decoder: 2335 decoder_to_use = decoder 2336 else: 2337 decoder_to_use = JsonDecoder(parameters={}) 2338 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2339 return DpathExtractor( 2340 decoder=decoder_to_use, 2341 field_path=model_field_path, 2342 config=config, 2343 parameters=model.parameters or {}, 2344 )
2365 def create_http_requester( 2366 self, 2367 model: HttpRequesterModel, 2368 config: Config, 2369 decoder: Decoder = JsonDecoder(parameters={}), 2370 query_properties_key: Optional[str] = None, 2371 use_cache: Optional[bool] = None, 2372 *, 2373 name: str, 2374 ) -> HttpRequester: 2375 authenticator = ( 2376 self._create_component_from_model( 2377 model=model.authenticator, 2378 config=config, 2379 url_base=model.url or model.url_base, 2380 name=name, 2381 decoder=decoder, 2382 ) 2383 if model.authenticator 2384 else None 2385 ) 2386 error_handler = ( 2387 self._create_component_from_model(model=model.error_handler, config=config) 2388 if model.error_handler 2389 else DefaultErrorHandler( 2390 backoff_strategies=[], 2391 response_filters=[], 2392 config=config, 2393 parameters=model.parameters or {}, 2394 ) 2395 ) 2396 2397 api_budget = self._api_budget 2398 2399 # Removes QueryProperties components from the interpolated mappings because it has been designed 2400 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2401 # instead of through jinja interpolation 2402 request_parameters: Optional[Union[str, Mapping[str, str]]] 2403 if isinstance(model.request_parameters, Mapping): 2404 request_parameters = self._remove_query_properties(model.request_parameters) 2405 else: 2406 request_parameters = model.request_parameters 2407 2408 request_options_provider = InterpolatedRequestOptionsProvider( 2409 request_body=model.request_body, 2410 request_body_data=model.request_body_data, 2411 request_body_json=model.request_body_json, 2412 request_headers=model.request_headers, 2413 request_parameters=request_parameters, 2414 query_properties_key=query_properties_key, 2415 config=config, 2416 parameters=model.parameters or {}, 2417 ) 2418 2419 assert model.use_cache is not None # for mypy 2420 assert model.http_method is not None # for mypy 2421 2422 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2423 2424 return HttpRequester( 2425 name=name, 2426 url=model.url, 2427 url_base=model.url_base, 2428 path=model.path, 2429 authenticator=authenticator, 2430 error_handler=error_handler, 2431 api_budget=api_budget, 2432 http_method=HttpMethod[model.http_method.value], 2433 request_options_provider=request_options_provider, 2434 config=config, 2435 disable_retries=self._disable_retries, 2436 parameters=model.parameters or {}, 2437 message_repository=self._message_repository, 2438 use_cache=should_use_cache, 2439 decoder=decoder, 2440 stream_response=decoder.is_stream_response() if decoder else False, 2441 )
2443 @staticmethod 2444 def create_http_response_filter( 2445 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2446 ) -> HttpResponseFilter: 2447 if model.action: 2448 action = ResponseAction(model.action.value) 2449 else: 2450 action = None 2451 2452 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2453 2454 http_codes = ( 2455 set(model.http_codes) if model.http_codes else set() 2456 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2457 2458 return HttpResponseFilter( 2459 action=action, 2460 failure_type=failure_type, 2461 error_message=model.error_message or "", 2462 error_message_contains=model.error_message_contains or "", 2463 http_codes=http_codes, 2464 predicate=model.predicate or "", 2465 config=config, 2466 parameters=model.parameters or {}, 2467 )
2475 def create_complex_field_type( 2476 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2477 ) -> ComplexFieldType: 2478 items = ( 2479 self._create_component_from_model(model=model.items, config=config) 2480 if isinstance(model.items, ComplexFieldTypeModel) 2481 else model.items 2482 ) 2483 2484 return ComplexFieldType(field_type=model.field_type, items=items)
2486 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2487 target_type = ( 2488 self._create_component_from_model(model=model.target_type, config=config) 2489 if isinstance(model.target_type, ComplexFieldTypeModel) 2490 else model.target_type 2491 ) 2492 2493 return TypesMap( 2494 target_type=target_type, 2495 current_type=model.current_type, 2496 condition=model.condition if model.condition is not None else "True", 2497 )
2499 def create_schema_type_identifier( 2500 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2501 ) -> SchemaTypeIdentifier: 2502 types_mapping = [] 2503 if model.types_mapping: 2504 types_mapping.extend( 2505 [ 2506 self._create_component_from_model(types_map, config=config) 2507 for types_map in model.types_mapping 2508 ] 2509 ) 2510 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2511 [x for x in model.schema_pointer] if model.schema_pointer else [] 2512 ) 2513 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2514 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2515 [x for x in model.type_pointer] if model.type_pointer else None 2516 ) 2517 2518 return SchemaTypeIdentifier( 2519 schema_pointer=model_schema_pointer, 2520 key_pointer=model_key_pointer, 2521 type_pointer=model_type_pointer, 2522 types_mapping=types_mapping, 2523 parameters=model.parameters or {}, 2524 )
2526 def create_dynamic_schema_loader( 2527 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2528 ) -> DynamicSchemaLoader: 2529 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2530 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2531 2532 schema_transformations = [] 2533 if model.schema_transformations: 2534 for transformation_model in model.schema_transformations: 2535 schema_transformations.append( 2536 self._create_component_from_model(model=transformation_model, config=config) 2537 ) 2538 name = "dynamic_properties" 2539 retriever = self._create_component_from_model( 2540 model=model.retriever, 2541 config=config, 2542 name=name, 2543 primary_key=None, 2544 stream_slicer=combined_slicers, 2545 transformations=[], 2546 use_cache=True, 2547 log_formatter=( 2548 lambda response: format_http_message( 2549 response, 2550 f"Schema loader '{name}' request", 2551 f"Request performed in order to extract schema.", 2552 name, 2553 is_auxiliary=True, 2554 ) 2555 ), 2556 ) 2557 schema_type_identifier = self._create_component_from_model( 2558 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2559 ) 2560 schema_filter = ( 2561 self._create_component_from_model( 2562 model.schema_filter, config=config, parameters=model.parameters or {} 2563 ) 2564 if model.schema_filter is not None 2565 else None 2566 ) 2567 2568 return DynamicSchemaLoader( 2569 retriever=retriever, 2570 config=config, 2571 schema_transformations=schema_transformations, 2572 schema_filter=schema_filter, 2573 schema_type_identifier=schema_type_identifier, 2574 parameters=model.parameters or {}, 2575 )
2595 def create_gzip_decoder( 2596 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2597 ) -> Decoder: 2598 _compressed_response_types = { 2599 "gzip", 2600 "x-gzip", 2601 "gzip, deflate", 2602 "x-gzip, deflate", 2603 "application/zip", 2604 "application/gzip", 2605 "application/x-gzip", 2606 "application/x-zip-compressed", 2607 } 2608 2609 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2610 2611 if self._emit_connector_builder_messages: 2612 # This is very surprising, but if the response is not streamed, 2613 # CompositeRawDecoder calls response.content and the requests library actually uncompresses the data, as opposed to response.raw, 2614 # which uses urllib3 directly and does not uncompress the data. 2615 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2616 2617 return CompositeRawDecoder.by_headers( 2618 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2619 stream_response=True, 2620 fallback_parser=gzip_parser.inner_parser, 2621 )
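# Illustrative sketch (not part of this factory) of the requests/urllib3 behavior the
# comment above relies on: `response.content` is transparently decoded according to
# Content-Encoding, while `response.raw` exposes the urllib3 stream and does not
# decompress unless `decode_content=True` is requested. The URL is hypothetical, and in
# practice only one of these reads would be performed per response.
import requests

response = requests.get("https://example.com/export.jsonl.gz", stream=True)
decoded_bytes = response.content  # gzip/deflate encoding is already decoded by requests
raw_bytes = response.raw.read()  # bytes as sent over the wire, still compressed
also_decoded = response.raw.read(decode_content=True)  # opt in to urllib3-side decoding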
2623 @staticmethod 2624 def create_incrementing_count_cursor( 2625 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2626 ) -> DatetimeBasedCursor: 2627 # This should not actually get used anywhere at runtime, but it is needed to pass checks since 2628 # we still parse models into components. The issue is that there is no runtime implementation of an 2629 # IncrementingCountCursor. 2630 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor, because the check is run without a ConcurrentCursor. 2631 return DatetimeBasedCursor( 2632 cursor_field=model.cursor_field, 2633 datetime_format="%Y-%m-%d", 2634 start_datetime="2024-12-12", 2635 config=config, 2636 parameters={}, 2637 )
2686 @staticmethod 2687 def create_jwt_authenticator( 2688 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2689 ) -> JwtAuthenticator: 2690 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2691 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2692 return JwtAuthenticator( 2693 config=config, 2694 parameters=model.parameters or {}, 2695 algorithm=JwtAlgorithm(model.algorithm.value), 2696 secret_key=model.secret_key, 2697 base64_encode_secret_key=model.base64_encode_secret_key, 2698 token_duration=model.token_duration, 2699 header_prefix=model.header_prefix, 2700 kid=jwt_headers.kid, 2701 typ=jwt_headers.typ, 2702 cty=jwt_headers.cty, 2703 iss=jwt_payload.iss, 2704 sub=jwt_payload.sub, 2705 aud=jwt_payload.aud, 2706 additional_jwt_headers=model.additional_jwt_headers, 2707 additional_jwt_payload=model.additional_jwt_payload, 2708 )
2710 def create_list_partition_router( 2711 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2712 ) -> ListPartitionRouter: 2713 request_option = ( 2714 self._create_component_from_model(model.request_option, config) 2715 if model.request_option 2716 else None 2717 ) 2718 return ListPartitionRouter( 2719 cursor_field=model.cursor_field, 2720 request_option=request_option, 2721 values=model.values, 2722 config=config, 2723 parameters=model.parameters or {}, 2724 )
2726 @staticmethod 2727 def create_min_max_datetime( 2728 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2729 ) -> MinMaxDatetime: 2730 return MinMaxDatetime( 2731 datetime=model.datetime, 2732 datetime_format=model.datetime_format or "", 2733 max_datetime=model.max_datetime or "", 2734 min_datetime=model.min_datetime or "", 2735 parameters=model.parameters or {}, 2736 )
2748 def create_oauth_authenticator( 2749 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2750 ) -> DeclarativeOauth2Authenticator: 2751 profile_assertion = ( 2752 self._create_component_from_model(model.profile_assertion, config=config) 2753 if model.profile_assertion 2754 else None 2755 ) 2756 2757 if model.refresh_token_updater: 2758 # ignore type error because fixing it would have a lot of dependencies, revisit later 2759 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2760 config, 2761 InterpolatedString.create( 2762 model.token_refresh_endpoint, # type: ignore 2763 parameters=model.parameters or {}, 2764 ).eval(config), 2765 access_token_name=InterpolatedString.create( 2766 model.access_token_name or "access_token", parameters=model.parameters or {} 2767 ).eval(config), 2768 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2769 expires_in_name=InterpolatedString.create( 2770 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2771 ).eval(config), 2772 client_id_name=InterpolatedString.create( 2773 model.client_id_name or "client_id", parameters=model.parameters or {} 2774 ).eval(config), 2775 client_id=InterpolatedString.create( 2776 model.client_id, parameters=model.parameters or {} 2777 ).eval(config) 2778 if model.client_id 2779 else model.client_id, 2780 client_secret_name=InterpolatedString.create( 2781 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2782 ).eval(config), 2783 client_secret=InterpolatedString.create( 2784 model.client_secret, parameters=model.parameters or {} 2785 ).eval(config) 2786 if model.client_secret 2787 else model.client_secret, 2788 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2789 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2790 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2791 grant_type_name=InterpolatedString.create( 2792 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2793 ).eval(config), 2794 grant_type=InterpolatedString.create( 2795 model.grant_type or "refresh_token", parameters=model.parameters or {} 2796 ).eval(config), 2797 refresh_request_body=InterpolatedMapping( 2798 model.refresh_request_body or {}, parameters=model.parameters or {} 2799 ).eval(config), 2800 refresh_request_headers=InterpolatedMapping( 2801 model.refresh_request_headers or {}, parameters=model.parameters or {} 2802 ).eval(config), 2803 scopes=model.scopes, 2804 token_expiry_date_format=model.token_expiry_date_format, 2805 message_repository=self._message_repository, 2806 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2807 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2808 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2809 ) 2810 # ignore type error because fixing it would have a lot of dependencies, revisit later 2811 return DeclarativeOauth2Authenticator( # type: ignore 2812 access_token_name=model.access_token_name or "access_token", 2813 access_token_value=model.access_token_value, 2814 client_id_name=model.client_id_name or "client_id", 2815 client_id=model.client_id, 2816 client_secret_name=model.client_secret_name or "client_secret", 2817 client_secret=model.client_secret, 2818 expires_in_name=model.expires_in_name or "expires_in", 2819 grant_type_name=model.grant_type_name or "grant_type", 2820 
grant_type=model.grant_type or "refresh_token", 2821 refresh_request_body=model.refresh_request_body, 2822 refresh_request_headers=model.refresh_request_headers, 2823 refresh_token_name=model.refresh_token_name or "refresh_token", 2824 refresh_token=model.refresh_token, 2825 scopes=model.scopes, 2826 token_expiry_date=model.token_expiry_date, 2827 token_expiry_date_format=model.token_expiry_date_format, 2828 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2829 token_refresh_endpoint=model.token_refresh_endpoint, 2830 config=config, 2831 parameters=model.parameters or {}, 2832 message_repository=self._message_repository, 2833 profile_assertion=profile_assertion, 2834 use_profile_assertion=model.use_profile_assertion, 2835 )
2837 def create_offset_increment( 2838 self, 2839 model: OffsetIncrementModel, 2840 config: Config, 2841 decoder: Decoder, 2842 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2843 **kwargs: Any, 2844 ) -> OffsetIncrement: 2845 if isinstance(decoder, PaginationDecoderDecorator): 2846 inner_decoder = decoder.decoder 2847 else: 2848 inner_decoder = decoder 2849 decoder = PaginationDecoderDecorator(decoder=decoder) 2850 2851 if self._is_supported_decoder_for_pagination(inner_decoder): 2852 decoder_to_use = decoder 2853 else: 2854 raise ValueError( 2855 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2856 ) 2857 2858 # Ideally we would instantiate the runtime extractor at the highest level (in this case the SimpleRetriever) 2859 # so that it can be shared by OffsetIncrement and RecordSelector. However, because we instantiate the 2860 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2861 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2862 # When we have more time to investigate we can look into reusing the same component. 2863 extractor = ( 2864 self._create_component_from_model( 2865 model=extractor_model, config=config, decoder=decoder_to_use 2866 ) 2867 if extractor_model 2868 else None 2869 ) 2870 2871 return OffsetIncrement( 2872 page_size=model.page_size, 2873 config=config, 2874 decoder=decoder_to_use, 2875 extractor=extractor, 2876 inject_on_first_request=model.inject_on_first_request or False, 2877 parameters=model.parameters or {}, 2878 )
2880 @staticmethod 2881 def create_page_increment( 2882 model: PageIncrementModel, config: Config, **kwargs: Any 2883 ) -> PageIncrement: 2884 return PageIncrement( 2885 page_size=model.page_size, 2886 config=config, 2887 start_from_page=model.start_from_page or 0, 2888 inject_on_first_request=model.inject_on_first_request or False, 2889 parameters=model.parameters or {}, 2890 )
2892 def create_parent_stream_config( 2893 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2894 ) -> ParentStreamConfig: 2895 declarative_stream = self._create_component_from_model( 2896 model.stream, config=config, **kwargs 2897 ) 2898 request_option = ( 2899 self._create_component_from_model(model.request_option, config=config) 2900 if model.request_option 2901 else None 2902 ) 2903 2904 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2905 raise ValueError( 2906 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2907 ) 2908 2909 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2910 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2911 ) 2912 2913 return ParentStreamConfig( 2914 parent_key=model.parent_key, 2915 request_option=request_option, 2916 stream=declarative_stream, 2917 partition_field=model.partition_field, 2918 config=config, 2919 incremental_dependency=model.incremental_dependency or False, 2920 parameters=model.parameters or {}, 2921 extra_fields=model.extra_fields, 2922 lazy_read_pointer=model_lazy_read_pointer, 2923 )
2925 def create_properties_from_endpoint( 2926 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2927 ) -> PropertiesFromEndpoint: 2928 retriever = self._create_component_from_model( 2929 model=model.retriever, 2930 config=config, 2931 name="dynamic_properties", 2932 primary_key=None, 2933 stream_slicer=None, 2934 transformations=[], 2935 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2936 ) 2937 return PropertiesFromEndpoint( 2938 property_field_path=model.property_field_path, 2939 retriever=retriever, 2940 config=config, 2941 parameters=model.parameters or {}, 2942 )
2944 def create_property_chunking( 2945 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2946 ) -> PropertyChunking: 2947 record_merge_strategy = ( 2948 self._create_component_from_model( 2949 model=model.record_merge_strategy, config=config, **kwargs 2950 ) 2951 if model.record_merge_strategy 2952 else None 2953 ) 2954 2955 property_limit_type: PropertyLimitType 2956 match model.property_limit_type: 2957 case PropertyLimitTypeModel.property_count: 2958 property_limit_type = PropertyLimitType.property_count 2959 case PropertyLimitTypeModel.characters: 2960 property_limit_type = PropertyLimitType.characters 2961 case _: 2962 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2963 2964 return PropertyChunking( 2965 property_limit_type=property_limit_type, 2966 property_limit=model.property_limit, 2967 record_merge_strategy=record_merge_strategy, 2968 config=config, 2969 parameters=model.parameters or {}, 2970 )
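# Illustrative sketch only (not PropertyChunking's implementation): with
# property_limit_type=property_count and property_limit=3, a property list is expected
# to be split into request-sized chunks along these lines. Property names are hypothetical.
properties = ["id", "name", "email", "created_at", "updated_at"]
chunks = [properties[i : i + 3] for i in range(0, len(properties), 3)]
# -> [["id", "name", "email"], ["created_at", "updated_at"]]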
2972 def create_query_properties( 2973 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2974 ) -> QueryProperties: 2975 if isinstance(model.property_list, list): 2976 property_list = model.property_list 2977 else: 2978 property_list = self._create_component_from_model( 2979 model=model.property_list, config=config, **kwargs 2980 ) 2981 2982 property_chunking = ( 2983 self._create_component_from_model( 2984 model=model.property_chunking, config=config, **kwargs 2985 ) 2986 if model.property_chunking 2987 else None 2988 ) 2989 2990 return QueryProperties( 2991 property_list=property_list, 2992 always_include_properties=model.always_include_properties, 2993 property_chunking=property_chunking, 2994 config=config, 2995 parameters=model.parameters or {}, 2996 )
3010 @staticmethod 3011 def create_request_option( 3012 model: RequestOptionModel, config: Config, **kwargs: Any 3013 ) -> RequestOption: 3014 inject_into = RequestOptionType(model.inject_into.value) 3015 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3016 [ 3017 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3018 for segment in model.field_path 3019 ] 3020 if model.field_path 3021 else None 3022 ) 3023 field_name = ( 3024 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3025 if model.field_name 3026 else None 3027 ) 3028 return RequestOption( 3029 field_name=field_name, 3030 field_path=field_path, 3031 inject_into=inject_into, 3032 parameters=kwargs.get("parameters", {}), 3033 )
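# Minimal sketch of the interpolation used above, assuming a config such as
# {"region": "eu"}; the jinja-style "{{ config['...'] }}" syntax is the standard
# interpolation mechanism of the declarative framework, and the field name used here
# is hypothetical.
field_name = InterpolatedString.create("{{ config['region'] }}_id", parameters={})
resolved = field_name.eval({"region": "eu"})  # -> "eu_id"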
3035 def create_record_selector( 3036 self, 3037 model: RecordSelectorModel, 3038 config: Config, 3039 *, 3040 name: str, 3041 transformations: List[RecordTransformation] | None = None, 3042 decoder: Decoder | None = None, 3043 client_side_incremental_sync: Dict[str, Any] | None = None, 3044 file_uploader: Optional[DefaultFileUploader] = None, 3045 **kwargs: Any, 3046 ) -> RecordSelector: 3047 extractor = self._create_component_from_model( 3048 model=model.extractor, decoder=decoder, config=config 3049 ) 3050 record_filter = ( 3051 self._create_component_from_model(model.record_filter, config=config) 3052 if model.record_filter 3053 else None 3054 ) 3055 3056 transform_before_filtering = ( 3057 False if model.transform_before_filtering is None else model.transform_before_filtering 3058 ) 3059 if client_side_incremental_sync: 3060 record_filter = ClientSideIncrementalRecordFilterDecorator( 3061 config=config, 3062 parameters=model.parameters, 3063 condition=model.record_filter.condition 3064 if (model.record_filter and hasattr(model.record_filter, "condition")) 3065 else None, 3066 **client_side_incremental_sync, 3067 ) 3068 transform_before_filtering = ( 3069 True 3070 if model.transform_before_filtering is None 3071 else model.transform_before_filtering 3072 ) 3073 3074 if model.schema_normalization is None: 3075 # default to no schema normalization if not set 3076 model.schema_normalization = SchemaNormalizationModel.None_ 3077 3078 schema_normalization = ( 3079 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3080 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3081 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3082 ) 3083 3084 return RecordSelector( 3085 extractor=extractor, 3086 name=name, 3087 config=config, 3088 record_filter=record_filter, 3089 transformations=transformations or [], 3090 file_uploader=file_uploader, 3091 schema_normalization=schema_normalization, 3092 parameters=model.parameters or {}, 3093 transform_before_filtering=transform_before_filtering, 3094 )
3104 def create_selective_authenticator( 3105 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3106 ) -> DeclarativeAuthenticator: 3107 authenticators = { 3108 name: self._create_component_from_model(model=auth, config=config) 3109 for name, auth in model.authenticators.items() 3110 } 3111 # SelectiveAuthenticator will return an instance of DeclarativeAuthenticator or raise a ValueError 3112 return SelectiveAuthenticator( # type: ignore[abstract] 3113 config=config, 3114 authenticators=authenticators, 3115 authenticator_selection_path=model.authenticator_selection_path, 3116 **kwargs, 3117 )
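# Minimal sketch of how the selection is expected to resolve, assuming a config shaped
# like {"credentials": {"auth_type": "api_key"}}: the value found at
# authenticator_selection_path picks the matching entry from `authenticators`. The config
# shape and the pre-built authenticator instances are hypothetical.
selected = SelectiveAuthenticator(
    config={"credentials": {"auth_type": "api_key"}},
    authenticators={"api_key": api_key_authenticator, "oauth": oauth_authenticator},
    authenticator_selection_path=["credentials", "auth_type"],
)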
3119 @staticmethod 3120 def create_legacy_session_token_authenticator( 3121 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3122 ) -> LegacySessionTokenAuthenticator: 3123 return LegacySessionTokenAuthenticator( 3124 api_url=url_base, 3125 header=model.header, 3126 login_url=model.login_url, 3127 password=model.password or "", 3128 session_token=model.session_token or "", 3129 session_token_response_key=model.session_token_response_key or "", 3130 username=model.username or "", 3131 validate_session_url=model.validate_session_url, 3132 config=config, 3133 parameters=model.parameters or {}, 3134 )
3136 def create_simple_retriever( 3137 self, 3138 model: SimpleRetrieverModel, 3139 config: Config, 3140 *, 3141 name: str, 3142 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3143 stream_slicer: Optional[StreamSlicer], 3144 request_options_provider: Optional[RequestOptionsProvider] = None, 3145 stop_condition_on_cursor: bool = False, 3146 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3147 transformations: List[RecordTransformation], 3148 file_uploader: Optional[DefaultFileUploader] = None, 3149 incremental_sync: Optional[ 3150 Union[ 3151 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3152 ] 3153 ] = None, 3154 use_cache: Optional[bool] = None, 3155 log_formatter: Optional[Callable[[Response], Any]] = None, 3156 **kwargs: Any, 3157 ) -> SimpleRetriever: 3158 def _get_url() -> str: 3159 """ 3160 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3161 This is needed because the URL is not set until the requester is created. 3162 """ 3163 3164 _url: str = ( 3165 model.requester.url 3166 if hasattr(model.requester, "url") and model.requester.url is not None 3167 else requester.get_url() 3168 ) 3169 _url_base: str = ( 3170 model.requester.url_base 3171 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3172 else requester.get_url_base() 3173 ) 3174 3175 return _url or _url_base 3176 3177 decoder = ( 3178 self._create_component_from_model(model=model.decoder, config=config) 3179 if model.decoder 3180 else JsonDecoder(parameters={}) 3181 ) 3182 record_selector = self._create_component_from_model( 3183 model=model.record_selector, 3184 name=name, 3185 config=config, 3186 decoder=decoder, 3187 transformations=transformations, 3188 client_side_incremental_sync=client_side_incremental_sync, 3189 file_uploader=file_uploader, 3190 ) 3191 3192 query_properties: Optional[QueryProperties] = None 3193 query_properties_key: Optional[str] = None 3194 if self._query_properties_in_request_parameters(model.requester): 3195 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3196 # places instead of default to request_parameters which isn't clearly documented 3197 if ( 3198 hasattr(model.requester, "fetch_properties_from_endpoint") 3199 and model.requester.fetch_properties_from_endpoint 3200 ): 3201 raise ValueError( 3202 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3203 ) 3204 3205 query_properties_definitions = [] 3206 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3207 if isinstance(request_parameter, QueryPropertiesModel): 3208 query_properties_key = key 3209 query_properties_definitions.append(request_parameter) 3210 3211 if len(query_properties_definitions) > 1: 3212 raise ValueError( 3213 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3214 ) 3215 3216 if len(query_properties_definitions) == 1: 3217 query_properties = self._create_component_from_model( 3218 model=query_properties_definitions[0], config=config 3219 ) 3220 elif ( 3221 hasattr(model.requester, "fetch_properties_from_endpoint") 3222 and model.requester.fetch_properties_from_endpoint 3223 ): 3224 
query_properties_definition = QueryPropertiesModel( 3225 type="QueryProperties", 3226 property_list=model.requester.fetch_properties_from_endpoint, 3227 always_include_properties=None, 3228 property_chunking=None, 3229 ) # type: ignore # $parameters has a default value 3230 3231 query_properties = self.create_query_properties( 3232 model=query_properties_definition, 3233 config=config, 3234 ) 3235 3236 requester = self._create_component_from_model( 3237 model=model.requester, 3238 decoder=decoder, 3239 name=name, 3240 query_properties_key=query_properties_key, 3241 use_cache=use_cache, 3242 config=config, 3243 ) 3244 3245 # Define cursor only if per partition or common incremental support is needed 3246 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3247 3248 if ( 3249 not isinstance(stream_slicer, DatetimeBasedCursor) 3250 or type(stream_slicer) is not DatetimeBasedCursor 3251 ): 3252 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 3253 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3254 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3255 # request_options_provider 3256 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3257 elif not request_options_provider: 3258 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3259 3260 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3261 if self._should_limit_slices_fetched(): 3262 stream_slicer = cast( 3263 StreamSlicer, 3264 StreamSlicerTestReadDecorator( 3265 wrapped_slicer=stream_slicer, 3266 maximum_number_of_slices=self._limit_slices_fetched or 5, 3267 ), 3268 ) 3269 3270 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3271 paginator = ( 3272 self._create_component_from_model( 3273 model=model.paginator, 3274 config=config, 3275 url_base=_get_url(), 3276 extractor_model=model.record_selector.extractor, 3277 decoder=decoder, 3278 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3279 ) 3280 if model.paginator 3281 else NoPagination(parameters={}) 3282 ) 3283 3284 ignore_stream_slicer_parameters_on_paginated_requests = ( 3285 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3286 ) 3287 3288 if ( 3289 model.partition_router 3290 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3291 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3292 and any( 3293 parent_stream_config.lazy_read_pointer 3294 for parent_stream_config in model.partition_router.parent_stream_configs 3295 ) 3296 ): 3297 if incremental_sync: 3298 if incremental_sync.type != "DatetimeBasedCursor": 3299 raise ValueError( 3300 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3301 ) 3302 3303 elif incremental_sync.step or incremental_sync.cursor_granularity: 3304 raise ValueError( 3305 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3306 ) 3307 3308 if model.decoder and model.decoder.type != "JsonDecoder": 3309 raise ValueError( 3310 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3311 ) 3312 3313 return LazySimpleRetriever( 3314 name=name, 3315 paginator=paginator, 3316 primary_key=primary_key, 3317 requester=requester, 3318 record_selector=record_selector, 3319 stream_slicer=stream_slicer, 3320 request_option_provider=request_options_provider, 3321 cursor=cursor, 3322 config=config, 3323 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3324 parameters=model.parameters or {}, 3325 ) 3326 3327 return SimpleRetriever( 3328 name=name, 3329 paginator=paginator, 3330 primary_key=primary_key, 3331 requester=requester, 3332 record_selector=record_selector, 3333 stream_slicer=stream_slicer, 3334 request_option_provider=request_options_provider, 3335 cursor=cursor, 3336 config=config, 3337 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3338 additional_query_properties=query_properties, 3339 log_formatter=self._get_log_formatter(log_formatter, name), 3340 parameters=model.parameters or {}, 3341 )
3391 def create_state_delegating_stream( 3392 self, 3393 model: StateDelegatingStreamModel, 3394 config: Config, 3395 has_parent_state: Optional[bool] = None, 3396 **kwargs: Any, 3397 ) -> DeclarativeStream: 3398 if ( 3399 model.full_refresh_stream.name != model.name 3400 or model.name != model.incremental_stream.name 3401 ): 3402 raise ValueError( 3403 f"state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3404 ) 3405 3406 stream_model = ( 3407 model.incremental_stream 3408 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3409 else model.full_refresh_stream 3410 ) 3411 3412 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # A DeclarativeStream will be created since stream_model is a stream description
3444 def create_async_retriever( 3445 self, 3446 model: AsyncRetrieverModel, 3447 config: Config, 3448 *, 3449 name: str, 3450 primary_key: Optional[ 3451 Union[str, List[str], List[List[str]]] 3452 ], # this seems to be needed to match create_simple_retriever 3453 stream_slicer: Optional[StreamSlicer], 3454 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3455 transformations: List[RecordTransformation], 3456 **kwargs: Any, 3457 ) -> AsyncRetriever: 3458 def _get_download_retriever() -> SimpleRetriever: 3459 # We create a record selector for the download retriever 3460 # with no schema normalization, no transformations and no record filter, 3461 # as all of this occurs in the record_selector of the AsyncRetriever 3462 record_selector = RecordSelector( 3463 extractor=download_extractor, 3464 name=name, 3465 record_filter=None, 3466 transformations=[], 3467 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3468 config=config, 3469 parameters={}, 3470 ) 3471 paginator = ( 3472 self._create_component_from_model( 3473 model=model.download_paginator, 3474 decoder=decoder, 3475 config=config, 3476 url_base="", 3477 ) 3478 if model.download_paginator 3479 else NoPagination(parameters={}) 3480 ) 3481 3482 return SimpleRetriever( 3483 requester=download_requester, 3484 record_selector=record_selector, 3485 primary_key=None, 3486 name=job_download_components_name, 3487 paginator=paginator, 3488 config=config, 3489 parameters={}, 3490 ) 3491 3492 def _get_job_timeout() -> datetime.timedelta: 3493 user_defined_timeout: Optional[int] = ( 3494 int( 3495 InterpolatedString.create( 3496 str(model.polling_job_timeout), 3497 parameters={}, 3498 ).eval(config) 3499 ) 3500 if model.polling_job_timeout 3501 else None 3502 ) 3503 3504 # during a test read, use the user-defined timeout or default to 15 minutes 3505 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3506 # the default value outside the connector builder is 60 minutes.
3507 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3508 3509 return ( 3510 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3511 ) 3512 3513 decoder = ( 3514 self._create_component_from_model(model=model.decoder, config=config) 3515 if model.decoder 3516 else JsonDecoder(parameters={}) 3517 ) 3518 record_selector = self._create_component_from_model( 3519 model=model.record_selector, 3520 config=config, 3521 decoder=decoder, 3522 name=name, 3523 transformations=transformations, 3524 client_side_incremental_sync=client_side_incremental_sync, 3525 ) 3526 3527 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3528 if self._should_limit_slices_fetched(): 3529 stream_slicer = cast( 3530 StreamSlicer, 3531 StreamSlicerTestReadDecorator( 3532 wrapped_slicer=stream_slicer, 3533 maximum_number_of_slices=self._limit_slices_fetched or 5, 3534 ), 3535 ) 3536 3537 creation_requester = self._create_component_from_model( 3538 model=model.creation_requester, 3539 decoder=decoder, 3540 config=config, 3541 name=f"job creation - {name}", 3542 ) 3543 polling_requester = self._create_component_from_model( 3544 model=model.polling_requester, 3545 decoder=decoder, 3546 config=config, 3547 name=f"job polling - {name}", 3548 ) 3549 job_download_components_name = f"job download - {name}" 3550 download_decoder = ( 3551 self._create_component_from_model(model=model.download_decoder, config=config) 3552 if model.download_decoder 3553 else JsonDecoder(parameters={}) 3554 ) 3555 download_extractor = ( 3556 self._create_component_from_model( 3557 model=model.download_extractor, 3558 config=config, 3559 decoder=download_decoder, 3560 parameters=model.parameters, 3561 ) 3562 if model.download_extractor 3563 else DpathExtractor( 3564 [], 3565 config=config, 3566 decoder=download_decoder, 3567 parameters=model.parameters or {}, 3568 ) 3569 ) 3570 download_requester = self._create_component_from_model( 3571 model=model.download_requester, 3572 decoder=download_decoder, 3573 config=config, 3574 name=job_download_components_name, 3575 ) 3576 download_retriever = _get_download_retriever() 3577 abort_requester = ( 3578 self._create_component_from_model( 3579 model=model.abort_requester, 3580 decoder=decoder, 3581 config=config, 3582 name=f"job abort - {name}", 3583 ) 3584 if model.abort_requester 3585 else None 3586 ) 3587 delete_requester = ( 3588 self._create_component_from_model( 3589 model=model.delete_requester, 3590 decoder=decoder, 3591 config=config, 3592 name=f"job delete - {name}", 3593 ) 3594 if model.delete_requester 3595 else None 3596 ) 3597 download_target_requester = ( 3598 self._create_component_from_model( 3599 model=model.download_target_requester, 3600 decoder=decoder, 3601 config=config, 3602 name=f"job extract_url - {name}", 3603 ) 3604 if model.download_target_requester 3605 else None 3606 ) 3607 status_extractor = self._create_component_from_model( 3608 model=model.status_extractor, decoder=decoder, config=config, name=name 3609 ) 3610 download_target_extractor = self._create_component_from_model( 3611 model=model.download_target_extractor, 3612 decoder=decoder, 3613 config=config, 3614 name=name, 3615 ) 3616 3617 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3618 creation_requester=creation_requester, 3619 polling_requester=polling_requester, 3620 download_retriever=download_retriever, 3621 download_target_requester=download_target_requester, 3622 abort_requester=abort_requester, 3623 
delete_requester=delete_requester, 3624 status_extractor=status_extractor, 3625 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3626 download_target_extractor=download_target_extractor, 3627 job_timeout=_get_job_timeout(), 3628 ) 3629 3630 async_job_partition_router = AsyncJobPartitionRouter( 3631 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3632 job_repository, 3633 stream_slices, 3634 self._job_tracker, 3635 self._message_repository, 3636 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3637 has_bulk_parent=False, 3638 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3639 # `None` means the default retry of 3 attempts is used under the hood. 3640 job_max_retry=1 if self._emit_connector_builder_messages else None, 3641 ), 3642 stream_slicer=stream_slicer, 3643 config=config, 3644 parameters=model.parameters or {}, 3645 ) 3646 3647 return AsyncRetriever( 3648 record_selector=record_selector, 3649 stream_slicer=async_job_partition_router, 3650 config=config, 3651 parameters=model.parameters or {}, 3652 )
3654 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3655 config_migrations = [ 3656 self._create_component_from_model(migration, config) 3657 for migration in ( 3658 model.config_normalization_rules.config_migrations 3659 if ( 3660 model.config_normalization_rules 3661 and model.config_normalization_rules.config_migrations 3662 ) 3663 else [] 3664 ) 3665 ] 3666 config_transformations = [ 3667 self._create_component_from_model(transformation, config) 3668 for transformation in ( 3669 model.config_normalization_rules.transformations 3670 if ( 3671 model.config_normalization_rules 3672 and model.config_normalization_rules.transformations 3673 ) 3674 else [] 3675 ) 3676 ] 3677 config_validations = [ 3678 self._create_component_from_model(validation, config) 3679 for validation in ( 3680 model.config_normalization_rules.validations 3681 if ( 3682 model.config_normalization_rules 3683 and model.config_normalization_rules.validations 3684 ) 3685 else [] 3686 ) 3687 ] 3688 3689 return Spec( 3690 connection_specification=model.connection_specification, 3691 documentation_url=model.documentation_url, 3692 advanced_auth=model.advanced_auth, 3693 parameters={}, 3694 config_migrations=config_migrations, 3695 config_transformations=config_transformations, 3696 config_validations=config_validations, 3697 )
3699 def create_substream_partition_router( 3700 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3701 ) -> SubstreamPartitionRouter: 3702 parent_stream_configs = [] 3703 if model.parent_stream_configs: 3704 parent_stream_configs.extend( 3705 [ 3706 self._create_message_repository_substream_wrapper( 3707 model=parent_stream_config, config=config, **kwargs 3708 ) 3709 for parent_stream_config in model.parent_stream_configs 3710 ] 3711 ) 3712 3713 return SubstreamPartitionRouter( 3714 parent_stream_configs=parent_stream_configs, 3715 parameters=model.parameters or {}, 3716 config=config, 3717 )
3745 @staticmethod 3746 def create_wait_time_from_header( 3747 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3748 ) -> WaitTimeFromHeaderBackoffStrategy: 3749 return WaitTimeFromHeaderBackoffStrategy( 3750 header=model.header, 3751 parameters=model.parameters or {}, 3752 config=config, 3753 regex=model.regex, 3754 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3755 if model.max_waiting_time_in_seconds is not None 3756 else None, 3757 )
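# Illustrative sketch only (not the backoff strategy's implementation): the intent is to
# read a wait time from a response header, optionally extracting it with a regex that
# captures the numeric value in group 1, and to cap it with max_waiting_time_in_seconds.
# Header names and values are hypothetical.
import re
from typing import Mapping, Optional

def backoff_from_header(
    headers: Mapping[str, str],
    header: str,
    regex: Optional[str] = None,
    max_wait: Optional[float] = None,
) -> Optional[float]:
    value: Optional[str] = headers.get(header)
    if value is not None and regex:
        match = re.search(regex, value)
        value = match.group(1) if match else None
    if value is None:
        return None
    wait = float(value)
    return min(wait, max_wait) if max_wait is not None else wait

backoff_from_header({"Retry-After": "30"}, "Retry-After")  # -> 30.0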
3759 @staticmethod 3760 def create_wait_until_time_from_header( 3761 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3762 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3763 return WaitUntilTimeFromHeaderBackoffStrategy( 3764 header=model.header, 3765 parameters=model.parameters or {}, 3766 config=config, 3767 min_wait=model.min_wait, 3768 regex=model.regex, 3769 )
3777 @staticmethod 3778 def create_components_mapping_definition( 3779 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3780 ) -> ComponentMappingDefinition: 3781 interpolated_value = InterpolatedString.create( 3782 model.value, parameters=model.parameters or {} 3783 ) 3784 field_path = [ 3785 InterpolatedString.create(path, parameters=model.parameters or {}) 3786 for path in model.field_path 3787 ] 3788 return ComponentMappingDefinition( 3789 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3790 value=interpolated_value, 3791 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3792 create_or_update=model.create_or_update, 3793 parameters=model.parameters or {}, 3794 )
3796 def create_http_components_resolver( 3797 self, model: HttpComponentsResolverModel, config: Config 3798 ) -> Any: 3799 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3800 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3801 3802 retriever = self._create_component_from_model( 3803 model=model.retriever, 3804 config=config, 3805 name="", 3806 primary_key=None, 3807 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3808 transformations=[], 3809 ) 3810 3811 components_mapping = [ 3812 self._create_component_from_model( 3813 model=components_mapping_definition_model, 3814 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3815 components_mapping_definition_model.value_type 3816 ), 3817 config=config, 3818 ) 3819 for components_mapping_definition_model in model.components_mapping 3820 ] 3821 3822 return HttpComponentsResolver( 3823 retriever=retriever, 3824 config=config, 3825 components_mapping=components_mapping, 3826 parameters=model.parameters or {}, 3827 )
3829 @staticmethod 3830 def create_stream_config( 3831 model: StreamConfigModel, config: Config, **kwargs: Any 3832 ) -> StreamConfig: 3833 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3834 [x for x in model.configs_pointer] if model.configs_pointer else [] 3835 ) 3836 3837 return StreamConfig( 3838 configs_pointer=model_configs_pointer, 3839 default_values=model.default_values, 3840 parameters=model.parameters or {}, 3841 )
3843 def create_config_components_resolver( 3844 self, model: ConfigComponentsResolverModel, config: Config 3845 ) -> Any: 3846 model_stream_configs = ( 3847 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3848 ) 3849 3850 stream_configs = [ 3851 self._create_component_from_model( 3852 stream_config, config=config, parameters=model.parameters or {} 3853 ) 3854 for stream_config in model_stream_configs 3855 ] 3856 3857 components_mapping = [ 3858 self._create_component_from_model( 3859 model=components_mapping_definition_model, 3860 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3861 components_mapping_definition_model.value_type 3862 ), 3863 config=config, 3864 ) 3865 for components_mapping_definition_model in model.components_mapping 3866 ] 3867 3868 return ConfigComponentsResolver( 3869 stream_configs=stream_configs, 3870 config=config, 3871 components_mapping=components_mapping, 3872 parameters=model.parameters or {}, 3873 )
3875 def create_parametrized_components_resolver( 3876 self, model: ParametrizedComponentsResolverModel, config: Config 3877 ) -> ParametrizedComponentsResolver: 3878 stream_parameters = StreamParametersDefinition( 3879 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3880 ) 3881 components_mapping = [ 3882 self._create_component_from_model( 3883 model=components_mapping_definition_model, 3884 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3885 components_mapping_definition_model.value_type 3886 ), 3887 config=config, 3888 ) 3889 for components_mapping_definition_model in model.components_mapping 3890 ] 3891 return ParametrizedComponentsResolver( 3892 stream_parameters=stream_parameters, 3893 config=config, 3894 components_mapping=components_mapping, 3895 parameters=model.parameters or {}, 3896 )
3920 def create_http_api_budget( 3921 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3922 ) -> HttpAPIBudget: 3923 policies = [ 3924 self._create_component_from_model(model=policy, config=config) 3925 for policy in model.policies 3926 ] 3927 3928 return HttpAPIBudget( 3929 policies=policies, 3930 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3931 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3932 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3933 )
3935 def create_fixed_window_call_rate_policy( 3936 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3937 ) -> FixedWindowCallRatePolicy: 3938 matchers = [ 3939 self._create_component_from_model(model=matcher, config=config) 3940 for matcher in model.matchers 3941 ] 3942 3943 # Set the initial reset timestamp to 10 days from now. 3944 # This value will be updated by the first request. 3945 return FixedWindowCallRatePolicy( 3946 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3947 period=parse_duration(model.period), 3948 call_limit=model.call_limit, 3949 matchers=matchers, 3950 )
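# Small sketch of the ISO 8601 durations accepted for `period` above, using the same
# isodate.parse_duration helper already imported by this module:
from isodate import parse_duration

parse_duration("PT1M")  # -> datetime.timedelta(seconds=60), a one-minute window
parse_duration("P1D")   # -> datetime.timedelta(days=1), a one-day window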
3952 def create_file_uploader( 3953 self, model: FileUploaderModel, config: Config, **kwargs: Any 3954 ) -> FileUploader: 3955 name = "File Uploader" 3956 requester = self._create_component_from_model( 3957 model=model.requester, 3958 config=config, 3959 name=name, 3960 **kwargs, 3961 ) 3962 download_target_extractor = self._create_component_from_model( 3963 model=model.download_target_extractor, 3964 config=config, 3965 name=name, 3966 **kwargs, 3967 ) 3968 emit_connector_builder_messages = self._emit_connector_builder_messages 3969 file_uploader = DefaultFileUploader( 3970 requester=requester, 3971 download_target_extractor=download_target_extractor, 3972 config=config, 3973 file_writer=NoopFileWriter() 3974 if emit_connector_builder_messages 3975 else LocalFileSystemFileWriter(), 3976 parameters=model.parameters or {}, 3977 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3978 ) 3979 3980 return ( 3981 ConnectorBuilderFileUploader(file_uploader) 3982 if emit_connector_builder_messages 3983 else file_uploader 3984 )
3986 def create_moving_window_call_rate_policy( 3987 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3988 ) -> MovingWindowCallRatePolicy: 3989 rates = [ 3990 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3991 ] 3992 matchers = [ 3993 self._create_component_from_model(model=matcher, config=config) 3994 for matcher in model.matchers 3995 ] 3996 return MovingWindowCallRatePolicy( 3997 rates=rates, 3998 matchers=matchers, 3999 )
4001 def create_unlimited_call_rate_policy( 4002 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4003 ) -> UnlimitedCallRatePolicy: 4004 matchers = [ 4005 self._create_component_from_model(model=matcher, config=config) 4006 for matcher in model.matchers 4007 ] 4008 4009 return UnlimitedCallRatePolicy( 4010 matchers=matchers, 4011 )
4020 def create_http_request_matcher( 4021 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4022 ) -> HttpRequestRegexMatcher: 4023 return HttpRequestRegexMatcher( 4024 method=model.method, 4025 url_base=model.url_base, 4026 url_path_pattern=model.url_path_pattern, 4027 params=model.params, 4028 headers=model.headers, 4029 )
4036 def create_grouping_partition_router( 4037 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4038 ) -> GroupingPartitionRouter: 4039 underlying_router = self._create_component_from_model( 4040 model=model.underlying_partition_router, config=config 4041 ) 4042 if model.group_size < 1: 4043 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4044 4045 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4046 # because they are specific to individual partitions and cannot be aggregated or handled 4047 # when grouping, potentially leading to incorrect API calls. Any request customization 4048 # should be managed at the stream level through the requester's configuration. 4049 if isinstance(underlying_router, SubstreamPartitionRouter): 4050 if any( 4051 parent_config.request_option 4052 for parent_config in underlying_router.parent_stream_configs 4053 ): 4054 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4055 4056 if isinstance(underlying_router, ListPartitionRouter): 4057 if underlying_router.request_option: 4058 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4059 4060 return GroupingPartitionRouter( 4061 group_size=model.group_size, 4062 underlying_partition_router=underlying_router, 4063 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4064 config=config, 4065 )
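# Illustrative sketch only (not GroupingPartitionRouter's implementation): the intent is
# to batch the partitions produced by the underlying router into groups of `group_size`,
# optionally de-duplicating values first. Partition values are hypothetical.
from itertools import islice
from typing import Any, Iterable, Iterator, List

def group_partitions(
    partitions: Iterable[Any], group_size: int, deduplicate: bool = True
) -> Iterator[List[Any]]:
    if deduplicate:
        seen: set = set()
        partitions = (p for p in partitions if not (p in seen or seen.add(p)))
    iterator = iter(partitions)
    while batch := list(islice(iterator, group_size)):
        yield batch

list(group_partitions(["a", "b", "a", "c"], group_size=2))  # -> [["a", "b"], ["c"]]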