airbyte.sources.base

Base class implementation for sources.
View Source
  1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  2"""Base class implementation for sources."""
  3
  4from __future__ import annotations
  5
  6import sys
  7import threading
  8import time
  9import warnings
 10from itertools import islice
 11from typing import TYPE_CHECKING, Any, Literal
 12
 13import yaml
 14from rich import print  # noqa: A004  # Allow shadowing the built-in
 15from rich.console import Console
 16from rich.markdown import Markdown
 17from rich.markup import escape
 18from rich.table import Table
 19from typing_extensions import override
 20
 21from airbyte_protocol.models import (
 22    AirbyteCatalog,
 23    AirbyteMessage,
 24    ConfiguredAirbyteCatalog,
 25    ConfiguredAirbyteStream,
 26    DestinationSyncMode,
 27    SyncMode,
 28    Type,
 29)
 30
 31from airbyte import exceptions as exc
 32from airbyte._connector_base import ConnectorBase
 33from airbyte._message_iterators import AirbyteMessageIterator
 34from airbyte._util.temp_files import as_temp_files
 35from airbyte.caches.util import get_default_cache
 36from airbyte.datasets._lazy import LazyDataset
 37from airbyte.progress import ProgressStyle, ProgressTracker
 38from airbyte.records import StreamRecord, StreamRecordHandler
 39from airbyte.results import ReadResult
 40from airbyte.shared.catalog_providers import CatalogProvider
 41from airbyte.strategies import WriteStrategy
 42
 43
 44if TYPE_CHECKING:
 45    from collections.abc import Generator, Iterable, Iterator
 46
 47    from airbyte_protocol.models import (
 48        AirbyteStream,
 49        ConnectorSpecification,
 50    )
 51
 52    from airbyte._executors.base import Executor
 53    from airbyte.caches import CacheBase
 54    from airbyte.callbacks import ConfigChangeCallback
 55    from airbyte.datasets._inmemory import InMemoryDataset
 56    from airbyte.documents import Document
 57    from airbyte.shared.state_providers import StateProviderBase
 58    from airbyte.shared.state_writers import StateWriterBase
 59
 60from airbyte.constants import (
 61    AB_EXTRACTED_AT_COLUMN,
 62    AB_META_COLUMN,
 63    AB_RAW_ID_COLUMN,
 64)
 65
 66
 67class Source(ConnectorBase):  # noqa: PLR0904
 68    """A class representing a source that can be called."""
 69
 70    connector_type = "source"
 71
 72    def __init__(
 73        self,
 74        executor: Executor,
 75        name: str,
 76        config: dict[str, Any] | None = None,
 77        *,
 78        config_change_callback: ConfigChangeCallback | None = None,
 79        streams: str | list[str] | None = None,
 80        validate: bool = False,
 81        cursor_key_overrides: dict[str, str] | None = None,
 82        primary_key_overrides: dict[str, str | list[str]] | None = None,
 83    ) -> None:
 84        """Initialize the source.
 85
 86        If config is provided, it will be validated against the spec if validate is True.
 87        """
 88        self._to_be_selected_streams: list[str] | str = []
 89        """Used to hold selection criteria before catalog is known."""
 90
 91        super().__init__(
 92            executor=executor,
 93            name=name,
 94            config=config,
 95            config_change_callback=config_change_callback,
 96            validate=validate,
 97        )
 98        self._config_dict: dict[str, Any] | None = None
 99        self._last_log_messages: list[str] = []
100        self._discovered_catalog: AirbyteCatalog | None = None
101        self._selected_stream_names: list[str] = []
102
103        self._cursor_key_overrides: dict[str, str] = {}
104        """A mapping of lower-cased stream names to cursor key overrides."""
105
106        self._primary_key_overrides: dict[str, list[str]] = {}
107        """A mapping of lower-cased stream names to primary key overrides."""
108
109        if config is not None:
110            self.set_config(config, validate=validate)
111        if streams is not None:
112            self.select_streams(streams)
113        if cursor_key_overrides is not None:
114            self.set_cursor_keys(**cursor_key_overrides)
115        if primary_key_overrides is not None:
116            self.set_primary_keys(**primary_key_overrides)
117
118    def set_streams(self, streams: list[str]) -> None:
119        """Deprecated. See select_streams()."""
120        warnings.warn(
121            "The 'set_streams' method is deprecated and will be removed in a future version. "
122            "Please use the 'select_streams' method instead.",
123            DeprecationWarning,
124            stacklevel=2,
125        )
126        self.select_streams(streams)
127
128    def set_cursor_key(
129        self,
130        stream_name: str,
131        cursor_key: str,
132    ) -> None:
133        """Set the cursor for a single stream.
134
135        Note:
136        - This does not unset previously set cursors.
137        - The cursor key must be a single field name.
138        - Not all streams support custom cursors. If a stream does not support custom cursors,
139          the override may be ignored.
140        - Stream names are case insensitive, while field names are case sensitive.
141        - Stream names are not validated by PyAirbyte. If the stream name
142          does not exist in the catalog, the override may be ignored.
143        """
144        self._cursor_key_overrides[stream_name.lower()] = cursor_key
145
146    def set_cursor_keys(
147        self,
148        **kwargs: str,
149    ) -> None:
150        """Override the cursor key for one or more streams.
151
152        Usage:
153            source.set_cursor_keys(
154                stream1="cursor1",
155                stream2="cursor2",
156            )
157
158        Note:
159        - This does not unset previously set cursors.
160        - The cursor key must be a single field name.
161        - Not all streams support custom cursors. If a stream does not support custom cursors,
162          the override may be ignored.
163        - Stream names are case insensitive, while field names are case sensitive.
164        - Stream names are not validated by PyAirbyte. If the stream name
165          does not exist in the catalog, the override may be ignored.
166        """
167        self._cursor_key_overrides.update({k.lower(): v for k, v in kwargs.items()})
168
169    def set_primary_key(
170        self,
171        stream_name: str,
172        primary_key: str | list[str],
173    ) -> None:
174        """Set the primary key for a single stream.
175
176        Note:
177        - This does not unset previously set primary keys.
178        - The primary key must be a single field name or a list of field names.
179        - Not all streams support overriding primary keys. If a stream does not support overriding
180          primary keys, the override may be ignored.
181        - Stream names are case insensitive, while field names are case sensitive.
182        - Stream names are not validated by PyAirbyte. If the stream name
183          does not exist in the catalog, the override may be ignored.
184        """
185        self._primary_key_overrides[stream_name.lower()] = (
186            primary_key if isinstance(primary_key, list) else [primary_key]
187        )
188
189    def set_primary_keys(
190        self,
191        **kwargs: str | list[str],
192    ) -> None:
193        """Override the primary keys for one or more streams.
194
195        This does not unset previously set primary keys.
196
197        Usage:
198            source.set_primary_keys(
199                stream1="pk1",
200                stream2=["pk1", "pk2"],
201            )
202
203        Note:
204        - This does not unset previously set primary keys.
205        - The primary key must be a single field name or a list of field names.
206        - Not all streams support overriding primary keys. If a stream does not support overriding
207          primary keys, the override may be ignored.
208        - Stream names are case insensitive, while field names are case sensitive.
209        - Stream names are not validated by PyAirbyte. If the stream name
210          does not exist in the catalog, the override may be ignored.
211        """
212        self._primary_key_overrides.update(
213            {k.lower(): v if isinstance(v, list) else [v] for k, v in kwargs.items()}
214        )
215
216    def _log_warning_preselected_stream(self, streams: str | list[str]) -> None:
217        """Logs a warning message indicating stream selection which are not selected yet."""
218        if streams == "*":
219            print(
220                "Warning: Config is not set yet. All streams will be selected after config is set.",
221                file=sys.stderr,
222            )
223        else:
224            print(
225                "Warning: Config is not set yet. "
226                f"Streams to be selected after config is set: {streams}",
227                file=sys.stderr,
228            )
229
230    def select_all_streams(self) -> None:
231        """Select all streams.
232
233        This is a more streamlined equivalent to:
234        > source.select_streams(source.get_available_streams()).
235        """
236        if self._config_dict is None:
237            self._to_be_selected_streams = "*"
238            self._log_warning_preselected_stream(self._to_be_selected_streams)
239            return
240
241        self._selected_stream_names = self.get_available_streams()
242
243    def select_streams(self, streams: str | list[str]) -> None:
244        """Select the stream names that should be read from the connector.
245
246        Args:
247            streams: A list of stream names to select. If set to "*", all streams will be selected.
248
249        Currently, if this is not set, all streams will be read.
250        """
251        if self._config_dict is None:
252            self._to_be_selected_streams = streams
253            self._log_warning_preselected_stream(streams)
254            return
255
256        if streams == "*":
257            self.select_all_streams()
258            return
259
260        if isinstance(streams, str):
261            # If a single stream is provided, convert it to a one-item list
262            streams = [streams]
263
264        available_streams = self.get_available_streams()
265        for stream in streams:
266            if stream not in available_streams:
267                raise exc.AirbyteStreamNotFoundError(
268                    stream_name=stream,
269                    connector_name=self.name,
270                    available_streams=available_streams,
271                )
272        self._selected_stream_names = streams
273
274    def get_selected_streams(self) -> list[str]:
275        """Get the selected streams.
276
277        If no streams are selected, return an empty list.
278        """
279        return self._selected_stream_names
280
281    def set_config(
282        self,
283        config: dict[str, Any],
284        *,
285        validate: bool = True,
286    ) -> None:
287        """Set the config for the connector.
288
289        If validate is True, raise an exception if the config fails validation.
290
291        If validate is False, validation will be deferred until check() or validate_config()
292        is called.
293        """
294        if validate:
295            self.validate_config(config)
296
297        self._config_dict = config
298
299        if self._to_be_selected_streams:
300            self.select_streams(self._to_be_selected_streams)
301            self._to_be_selected_streams = []
302
303    def _discover(self) -> AirbyteCatalog:
304        """Call discover on the connector.
305
306        This involves the following steps:
307        - Write the config to a temporary file
308        - execute the connector with discover --config <config_file>
309        - Listen to the messages and return the first AirbyteCatalog that comes along.
310        - Make sure the subprocess is killed when the function returns.
311        """
312        with as_temp_files([self._hydrated_config]) as [config_file]:
313            for msg in self._execute(["discover", "--config", config_file]):
314                if msg.type == Type.CATALOG and msg.catalog:
315                    return msg.catalog
316            raise exc.AirbyteConnectorMissingCatalogError(
317                connector_name=self.name,
318                log_text=self._last_log_messages,
319            )
320
321    def get_available_streams(self) -> list[str]:
322        """Get the available streams from the spec."""
323        return [s.name for s in self.discovered_catalog.streams]
324
325    def _get_incremental_stream_names(self) -> list[str]:
326        """Get the name of streams that support incremental sync."""
327        return [
328            stream.name
329            for stream in self.discovered_catalog.streams
330            if SyncMode.incremental in stream.supported_sync_modes
331        ]
332
333    @override
334    def _get_spec(self, *, force_refresh: bool = False) -> ConnectorSpecification:
335        """Call spec on the connector.
336
337        This involves the following steps:
338        * execute the connector with spec
339        * Listen to the messages and return the first AirbyteCatalog that comes along.
340        * Make sure the subprocess is killed when the function returns.
341        """
342        if force_refresh or self._spec is None:
343            for msg in self._execute(["spec"]):
344                if msg.type == Type.SPEC and msg.spec:
345                    self._spec = msg.spec
346                    break
347
348        if self._spec:
349            return self._spec
350
351        raise exc.AirbyteConnectorMissingSpecError(
352            connector_name=self.name,
353            log_text=self._last_log_messages,
354        )
355
356    @property
357    def config_spec(self) -> dict[str, Any]:
358        """Generate a configuration spec for this connector, as a JSON Schema definition.
359
360        This function generates a JSON Schema dictionary with configuration specs for the
361        current connector, as a dictionary.
362
363        Returns:
364            dict: The JSON Schema configuration spec as a dictionary.
365        """
366        return self._get_spec(force_refresh=True).connectionSpecification
367
368    @property
369    def _yaml_spec(self) -> str:
370        """Get the spec as a yaml string.
371
372        For now, the primary use case is for writing and debugging a valid config for a source.
373
374        This is private for now because we probably want better polish before exposing this
375        as a stable interface. This will also get easier when we have docs links with this info
376        for each connector.
377        """
378        spec_obj: ConnectorSpecification = self._get_spec()
379        spec_dict: dict[str, Any] = spec_obj.model_dump(exclude_unset=True)
380        # convert to a yaml string
381        return yaml.dump(spec_dict)
382
383    @property
384    def docs_url(self) -> str:
385        """Get the URL to the connector's documentation."""
386        return "https://docs.airbyte.com/integrations/sources/" + self.name.lower().replace(
387            "source-", ""
388        )
389
390    @property
391    def discovered_catalog(self) -> AirbyteCatalog:
392        """Get the raw catalog for the given streams.
393
394        If the catalog is not yet known, we call discover to get it.
395        """
396        if self._discovered_catalog is None:
397            self._discovered_catalog = self._discover()
398
399        return self._discovered_catalog
400
401    @property
402    def configured_catalog(self) -> ConfiguredAirbyteCatalog:
403        """Get the configured catalog for the given streams.
404
405        If the raw catalog is not yet known, we call discover to get it.
406
407        If no specific streams are selected, we return a catalog that syncs all available streams.
408
409        TODO: We should consider disabling by default the streams that the connector would
410        disable by default. (For instance, streams that require a premium license are sometimes
411        disabled by default within the connector.)
412        """
413        # Ensure discovered catalog is cached before we start
414        _ = self.discovered_catalog
415
416        # Filter for selected streams if set, otherwise use all available streams:
417        streams_filter: list[str] = self._selected_stream_names or self.get_available_streams()
418        return self.get_configured_catalog(streams=streams_filter)
419
420    def get_configured_catalog(
421        self,
422        streams: Literal["*"] | list[str] | None = None,
423    ) -> ConfiguredAirbyteCatalog:
424        """Get a configured catalog for the given streams.
425
426        If no streams are provided, the selected streams will be used. If no streams are selected,
427        all available streams will be used.
428
429        If '*' is provided, all available streams will be used.
430        """
431        selected_streams: list[str] = []
432        if streams is None:
433            selected_streams = self._selected_stream_names or self.get_available_streams()
434        elif streams == "*":
435            selected_streams = self.get_available_streams()
436        elif isinstance(streams, list):
437            selected_streams = streams
438        else:
439            raise exc.PyAirbyteInputError(
440                message="Invalid streams argument.",
441                input_value=streams,
442            )
443
444        return ConfiguredAirbyteCatalog(
445            streams=[
446                ConfiguredAirbyteStream(
447                    stream=stream,
448                    destination_sync_mode=DestinationSyncMode.overwrite,
449                    sync_mode=SyncMode.incremental,
450                    primary_key=(
451                        [self._primary_key_overrides[stream.name.lower()]]
452                        if stream.name.lower() in self._primary_key_overrides
453                        else stream.source_defined_primary_key
454                    ),
455                    cursor_field=(
456                        [self._cursor_key_overrides[stream.name.lower()]]
457                        if stream.name.lower() in self._cursor_key_overrides
458                        else stream.default_cursor_field
459                    ),
460                    # These are unused in the current implementation:
461                    generation_id=None,
462                    minimum_generation_id=None,
463                    sync_id=None,
464                )
465                for stream in self.discovered_catalog.streams
466                if stream.name in selected_streams
467            ],
468        )
469
470    def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
471        """Return the JSON Schema spec for the specified stream name."""
472        catalog: AirbyteCatalog = self.discovered_catalog
473        found: list[AirbyteStream] = [
474            stream for stream in catalog.streams if stream.name == stream_name
475        ]
476
477        if len(found) == 0:
478            raise exc.PyAirbyteInputError(
479                message="Stream name does not exist in catalog.",
480                input_value=stream_name,
481            )
482
483        if len(found) > 1:
484            raise exc.PyAirbyteInternalError(
485                message="Duplicate streams found with the same name.",
486                context={
487                    "found_streams": found,
488                },
489            )
490
491        return found[0].json_schema
492
493    def get_records(
494        self,
495        stream: str,
496        *,
497        limit: int | None = None,
498        stop_event: threading.Event | None = None,
499        normalize_field_names: bool = False,
500        prune_undeclared_fields: bool = True,
501    ) -> LazyDataset:
502        """Read a stream from the connector.
503
504        Args:
505            stream: The name of the stream to read.
506            limit: The maximum number of records to read. If None, all records will be read.
507            stop_event: If set, the event can be triggered by the caller to stop reading records
508                and terminate the process.
509            normalize_field_names: When `True`, field names will be normalized to lower case, with
510                special characters removed. This matches the behavior of PyAirbyte caches and most
511                Airbyte destinations.
512            prune_undeclared_fields: When `True`, undeclared fields will be pruned from the records,
513                which generally matches the behavior of PyAirbyte caches and most Airbyte
514                destinations, specifically when you expect the catalog may be stale. You can disable
515                this to keep all fields in the records.
516
517        This involves the following steps:
518        * Call discover to get the catalog
519        * Generate a configured catalog that syncs the given stream in full_refresh mode
520        * Write the configured catalog and the config to a temporary file
521        * execute the connector with read --config <config_file> --catalog <catalog_file>
522        * Listen to the messages and return the first AirbyteRecordMessages that come along.
523        * Make sure the subprocess is killed when the function returns.
524        """
525        stop_event = stop_event or threading.Event()
526        configured_catalog = self.get_configured_catalog(streams=[stream])
527        if len(configured_catalog.streams) == 0:
528            raise exc.PyAirbyteInputError(
529                message="Requested stream does not exist.",
530                context={
531                    "stream": stream,
532                    "available_streams": self.get_available_streams(),
533                    "connector_name": self.name,
534                },
535            ) from KeyError(stream)
536
537        configured_stream = configured_catalog.streams[0]
538
539        def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]:
540            yield from records
541
542        stream_record_handler = StreamRecordHandler(
543            json_schema=self.get_stream_json_schema(stream),
544            prune_extra_fields=prune_undeclared_fields,
545            normalize_keys=normalize_field_names,
546        )
547
548        # This method is non-blocking, so we use "PLAIN" to avoid a live progress display
549        progress_tracker = ProgressTracker(
550            ProgressStyle.PLAIN,
551            source=self,
552            cache=None,
553            destination=None,
554            expected_streams=[stream],
555        )
556
557        iterator: Iterator[dict[str, Any]] = (
558            StreamRecord.from_record_message(
559                record_message=record.record,
560                stream_record_handler=stream_record_handler,
561            )
562            for record in self._read_with_catalog(
563                catalog=configured_catalog,
564                progress_tracker=progress_tracker,
565                stop_event=stop_event,
566            )
567            if record.record
568        )
569        if limit is not None:
570            # Stop the iterator after the limit is reached
571            iterator = islice(iterator, limit)
572
573        return LazyDataset(
574            iterator,
575            stream_metadata=configured_stream,
576            stop_event=stop_event,
577            progress_tracker=progress_tracker,
578        )
579
580    def get_documents(
581        self,
582        stream: str,
583        title_property: str | None = None,
584        content_properties: list[str] | None = None,
585        metadata_properties: list[str] | None = None,
586        *,
587        render_metadata: bool = False,
588    ) -> Iterable[Document]:
589        """Read a stream from the connector and return the records as documents.
590
591        If metadata_properties is not set, all properties that are not content will be added to
592        the metadata.
593
594        If render_metadata is True, metadata will be rendered in the document, as well as the
595        the main content.
596        """
597        return self.get_records(stream).to_documents(
598            title_property=title_property,
599            content_properties=content_properties,
600            metadata_properties=metadata_properties,
601            render_metadata=render_metadata,
602        )
603
604    def get_samples(
605        self,
606        streams: list[str] | Literal["*"] | None = None,
607        *,
608        limit: int = 5,
609        on_error: Literal["raise", "ignore", "log"] = "raise",
610    ) -> dict[str, InMemoryDataset | None]:
611        """Get a sample of records from the given streams."""
612        if streams == "*":
613            streams = self.get_available_streams()
614        elif streams is None:
615            streams = self.get_selected_streams()
616
617        results: dict[str, InMemoryDataset | None] = {}
618        for stream in streams:
619            stop_event = threading.Event()
620            try:
621                results[stream] = self.get_records(
622                    stream,
623                    limit=limit,
624                    stop_event=stop_event,
625                ).fetch_all()
626                stop_event.set()
627            except Exception as ex:
628                results[stream] = None
629                if on_error == "ignore":
630                    continue
631
632                if on_error == "raise":
633                    raise ex from None
634
635                if on_error == "log":
636                    print(f"Error fetching sample for stream '{stream}': {ex}")
637
638        return results
639
640    def print_samples(
641        self,
642        streams: list[str] | Literal["*"] | None = None,
643        *,
644        limit: int = 5,
645        on_error: Literal["raise", "ignore", "log"] = "log",
646    ) -> None:
647        """Print a sample of records from the given streams."""
648        internal_cols: list[str] = [
649            AB_EXTRACTED_AT_COLUMN,
650            AB_META_COLUMN,
651            AB_RAW_ID_COLUMN,
652        ]
653        col_limit = 10
654        if streams == "*":
655            streams = self.get_available_streams()
656        elif streams is None:
657            streams = self.get_selected_streams()
658
659        console = Console()
660
661        console.print(
662            Markdown(
663                f"# Sample Records from `{self.name}` ({len(streams)} selected streams)",
664                justify="left",
665            )
666        )
667
668        for stream in streams:
669            console.print(Markdown(f"## `{stream}` Stream Sample", justify="left"))
670            samples = self.get_samples(
671                streams=[stream],
672                limit=limit,
673                on_error=on_error,
674            )
675            dataset = samples[stream]
676
677            table = Table(
678                show_header=True,
679                show_lines=True,
680            )
681            if dataset is None:
682                console.print(
683                    Markdown("**⚠️ `Error fetching sample records.` ⚠️**"),
684                )
685                continue
686
687            if len(dataset.column_names) > col_limit:
688                # We'll pivot the columns so each column is its own row
689                table.add_column("Column Name")
690                for _ in range(len(dataset)):
691                    table.add_column(overflow="fold")
692                for col in dataset.column_names:
693                    table.add_row(
694                        Markdown(f"**`{col}`**"),
695                        *[escape(str(record[col])) for record in dataset],
696                    )
697            else:
698                for col in dataset.column_names:
699                    table.add_column(
700                        Markdown(f"**`{col}`**"),
701                        overflow="fold",
702                    )
703
704                for record in dataset:
705                    table.add_row(
706                        *[
707                            escape(str(val))
708                            for key, val in record.items()
709                            # Exclude internal Airbyte columns.
710                            if key not in internal_cols
711                        ]
712                    )
713
714            console.print(table)
715
716        console.print(Markdown("--------------"))
717
718    def _get_airbyte_message_iterator(
719        self,
720        *,
721        streams: Literal["*"] | list[str] | None = None,
722        state_provider: StateProviderBase | None = None,
723        progress_tracker: ProgressTracker,
724        force_full_refresh: bool = False,
725    ) -> AirbyteMessageIterator:
726        """Get an AirbyteMessageIterator for this source."""
727        return AirbyteMessageIterator(
728            self._read_with_catalog(
729                catalog=self.get_configured_catalog(streams=streams),
730                state=state_provider if not force_full_refresh else None,
731                progress_tracker=progress_tracker,
732            )
733        )
734
735    def _read_with_catalog(
736        self,
737        catalog: ConfiguredAirbyteCatalog,
738        progress_tracker: ProgressTracker,
739        *,
740        state: StateProviderBase | None = None,
741        stop_event: threading.Event | None = None,
742    ) -> Generator[AirbyteMessage, None, None]:
743        """Call read on the connector.
744
745        This involves the following steps:
746        * Write the config to a temporary file
747        * execute the connector with read --config <config_file> --catalog <catalog_file>
748        * Listen to the messages and return the AirbyteRecordMessages that come along.
749        * Send out telemetry on the performed sync (with information about which source was used and
750          the type of the cache)
751        """
752        with as_temp_files(
753            [
754                self._hydrated_config,
755                catalog.model_dump_json(),
756                state.to_state_input_file_text() if state else "[]",
757            ]
758        ) as [
759            config_file,
760            catalog_file,
761            state_file,
762        ]:
763            message_generator = self._execute(
764                [
765                    "read",
766                    "--config",
767                    config_file,
768                    "--catalog",
769                    catalog_file,
770                    "--state",
771                    state_file,
772                ],
773                progress_tracker=progress_tracker,
774            )
775            for message in progress_tracker.tally_records_read(message_generator):
776                if stop_event and stop_event.is_set():
777                    progress_tracker._log_sync_cancel()  # noqa: SLF001
778                    time.sleep(0.1)
779                    return
780
781                yield message
782
783        progress_tracker.log_read_complete()
784
785    def _peek_airbyte_message(
786        self,
787        message: AirbyteMessage,
788        *,
789        raise_on_error: bool = True,
790    ) -> None:
791        """Process an Airbyte message.
792
793        This method handles reading Airbyte messages and taking action, if needed, based on the
794        message type. For instance, log messages are logged, records are tallied, and errors are
795        raised as exceptions if `raise_on_error` is True.
796
797        Raises:
798            AirbyteConnectorFailedError: If a TRACE message of type ERROR is emitted.
799        """
800        super()._peek_airbyte_message(message, raise_on_error=raise_on_error)
801
802    def _log_incremental_streams(
803        self,
804        *,
805        incremental_streams: set[str] | None = None,
806    ) -> None:
807        """Log the streams which are using incremental sync mode."""
808        log_message = (
809            "The following streams are currently using incremental sync:\n"
810            f"{incremental_streams}\n"
811            "To perform a full refresh, set 'force_full_refresh=True' in 'airbyte.read()' method."
812        )
813        print(log_message, file=sys.stderr)
814
815    def read(
816        self,
817        cache: CacheBase | None = None,
818        *,
819        streams: str | list[str] | None = None,
820        write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
821        force_full_refresh: bool = False,
822        skip_validation: bool = False,
823    ) -> ReadResult:
824        """Read from the connector and write to the cache.
825
826        Args:
827            cache: The cache to write to. If not set, a default cache will be used.
828            streams: Optional if already set. A list of stream names to select for reading. If set
829                to "*", all streams will be selected.
830            write_strategy: The strategy to use when writing to the cache. If a string, it must be
831                one of "append", "merge", "replace", or "auto". If a WriteStrategy, it must be one
832                of WriteStrategy.APPEND, WriteStrategy.MERGE, WriteStrategy.REPLACE, or
833                WriteStrategy.AUTO.
834            force_full_refresh: If True, the source will operate in full refresh mode. Otherwise,
835                streams will be read in incremental mode if supported by the connector. This option
836                must be True when using the "replace" strategy.
837            skip_validation: If True, PyAirbyte will not pre-validate the input configuration before
838                running the connector. This can be helpful in debugging, when you want to send
839                configurations to the connector that otherwise might be rejected by JSON Schema
840                validation rules.
841        """
842        cache = cache or get_default_cache()
843        progress_tracker = ProgressTracker(
844            source=self,
845            cache=cache,
846            destination=None,
847            expected_streams=None,  # Will be set later
848        )
849
850        # Set up state provider if not in full refresh mode
851        if force_full_refresh:
852            state_provider: StateProviderBase | None = None
853        else:
854            state_provider = cache.get_state_provider(
855                source_name=self._name,
856            )
857        state_writer = cache.get_state_writer(source_name=self._name)
858
859        if streams:
860            self.select_streams(streams)
861
862        if not self._selected_stream_names:
863            raise exc.PyAirbyteNoStreamsSelectedError(
864                connector_name=self.name,
865                available_streams=self.get_available_streams(),
866            )
867
868        try:
869            result = self._read_to_cache(
870                cache=cache,
871                catalog_provider=CatalogProvider(self.configured_catalog),
872                stream_names=self._selected_stream_names,
873                state_provider=state_provider,
874                state_writer=state_writer,
875                write_strategy=write_strategy,
876                force_full_refresh=force_full_refresh,
877                skip_validation=skip_validation,
878                progress_tracker=progress_tracker,
879            )
880        except exc.PyAirbyteInternalError as ex:
881            progress_tracker.log_failure(exception=ex)
882            raise exc.AirbyteConnectorFailedError(
883                connector_name=self.name,
884                log_text=self._last_log_messages,
885            ) from ex
886        except Exception as ex:
887            progress_tracker.log_failure(exception=ex)
888            raise
889
890        progress_tracker.log_success()
891        return result
892
893    def _read_to_cache(  # noqa: PLR0913  # Too many arguments
894        self,
895        cache: CacheBase,
896        *,
897        catalog_provider: CatalogProvider,
898        stream_names: list[str],
899        state_provider: StateProviderBase | None,
900        state_writer: StateWriterBase | None,
901        write_strategy: str | WriteStrategy = WriteStrategy.AUTO,
902        force_full_refresh: bool = False,
903        skip_validation: bool = False,
904        progress_tracker: ProgressTracker,
905    ) -> ReadResult:
906        """Internal read method."""
907        if write_strategy == WriteStrategy.REPLACE and not force_full_refresh:
908            warnings.warn(
909                message=(
910                    "Using `REPLACE` strategy without also setting `full_refresh_mode=True` "
911                    "could result in data loss. "
912                    "To silence this warning, use the following: "
913                    'warnings.filterwarnings("ignore", '
914                    'category="airbyte.warnings.PyAirbyteDataLossWarning")`'
915                ),
916                category=exc.PyAirbyteDataLossWarning,
917                stacklevel=1,
918            )
919        if isinstance(write_strategy, str):
920            try:
921                write_strategy = WriteStrategy(write_strategy)
922            except ValueError:
923                raise exc.PyAirbyteInputError(
924                    message="Invalid strategy",
925                    context={
926                        "write_strategy": write_strategy,
927                        "available_strategies": [s.value for s in WriteStrategy],
928                    },
929                ) from None
930
931        # Run optional validation step
932        if not skip_validation:
933            self.validate_config()
934
935        # Log incremental stream if incremental streams are known
936        if state_provider and state_provider.known_stream_names:
937            # Retrieve set of the known streams support which support incremental sync
938            incremental_streams = (
939                set(self._get_incremental_stream_names())
940                & state_provider.known_stream_names
941                & set(self.get_selected_streams())
942            )
943            if incremental_streams:
944                self._log_incremental_streams(incremental_streams=incremental_streams)
945
946        airbyte_message_iterator = AirbyteMessageIterator(
947            self._read_with_catalog(
948                catalog=catalog_provider.configured_catalog,
949                state=state_provider,
950                progress_tracker=progress_tracker,
951            )
952        )
953        cache._write_airbyte_message_stream(  # noqa: SLF001  # Non-public API
954            stdin=airbyte_message_iterator,
955            catalog_provider=catalog_provider,
956            write_strategy=write_strategy,
957            state_writer=state_writer,
958            progress_tracker=progress_tracker,
959        )
960
961        # Flush the WAL, if applicable
962        cache.processor._do_checkpoint()  # noqa: SLF001  # Non-public API
963
964        return ReadResult(
965            source_name=self.name,
966            progress_tracker=progress_tracker,
967            processed_streams=stream_names,
968            cache=cache,
969        )
970
971
972__all__ = [
973    "Source",
974]
airbyte.sources.base

Usage:

Usage:

Arguments:

Returns:

Arguments:

Arguments:

Inherited Members