airbyte_cdk.destinations.vector_db_based.config

  1#
  2# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  3#
  4
  5from typing import Any, Dict, List, Literal, Optional, Union
  6
  7import dpath
  8from pydantic.v1 import BaseModel, Field
  9
 10from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 11from airbyte_cdk.utils.spec_schema_transformations import resolve_refs
 12
 13
 14class SeparatorSplitterConfigModel(BaseModel):
 15    mode: Literal["separator"] = Field("separator", const=True)
 16    separators: List[str] = Field(
 17        default=['"\\n\\n"', '"\\n"', '" "', '""'],
 18        title="Separators",
 19        description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
 20    )
 21    keep_separator: bool = Field(
 22        default=False,
 23        title="Keep separator",
 24        description="Whether to keep the separator in the resulting chunks",
 25    )
 26
 27    class Config(OneOfOptionConfig):
 28        title = "By Separator"
 29        description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
 30        discriminator = "mode"
 31
 32
 33class MarkdownHeaderSplitterConfigModel(BaseModel):
 34    mode: Literal["markdown"] = Field("markdown", const=True)
 35    split_level: int = Field(
 36        default=1,
 37        title="Split level",
 38        description="Level of markdown headers to split text fields by. Headings down to the specified level will be used as split points",
 39        le=6,
 40        ge=1,
 41    )
 42
 43    class Config(OneOfOptionConfig):
 44        title = "By Markdown header"
 45        description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
 46        discriminator = "mode"
 47
 48
 49class CodeSplitterConfigModel(BaseModel):
 50    mode: Literal["code"] = Field("code", const=True)
 51    language: str = Field(
 52        title="Language",
 53        description="Split code in suitable places based on the programming language",
 54        enum=[
 55            "cpp",
 56            "go",
 57            "java",
 58            "js",
 59            "php",
 60            "proto",
 61            "python",
 62            "rst",
 63            "ruby",
 64            "rust",
 65            "scala",
 66            "swift",
 67            "markdown",
 68            "latex",
 69            "html",
 70            "sol",
 71        ],
 72    )
 73
 74    class Config(OneOfOptionConfig):
 75        title = "By Programming Language"
 76        description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
 77        discriminator = "mode"
 78
 79
 80TextSplitterConfigModel = Union[
 81    SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
 82]
 83
 84
 85class FieldNameMappingConfigModel(BaseModel):
 86    from_field: str = Field(title="From field name", description="The field name in the source")
 87    to_field: str = Field(
 88        title="To field name", description="The field name to use in the destination"
 89    )
 90
 91
 92class ProcessingConfigModel(BaseModel):
 93    chunk_size: int = Field(
 94        ...,
 95        title="Chunk size",
 96        maximum=8191,
 97        minimum=1,
 98        description="Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)",
 99    )
100    chunk_overlap: int = Field(
101        title="Chunk overlap",
102        description="Size of overlap between chunks in tokens to store in vector store to better capture relevant context",
103        default=0,
104    )
105    text_fields: Optional[List[str]] = Field(
106        default=[],
107        title="Text fields to embed",
108        description="List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array.",
109        always_show=True,
110        examples=["text", "user.name", "users.*.name"],
111    )
112    metadata_fields: Optional[List[str]] = Field(
113        default=[],
114        title="Fields to store as metadata",
115        description="List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.",
116        always_show=True,
117        examples=["age", "user", "user.name"],
118    )
119    text_splitter: TextSplitterConfigModel = Field(
120        default=None,
121        title="Text splitter",
122        discriminator="mode",
123        type="object",
124        description="Split text fields into chunks based on the specified method.",
125    )
126    field_name_mappings: Optional[List[FieldNameMappingConfigModel]] = Field(
127        default=[],
128        title="Field name mappings",
129        description="List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.",
130    )
131
132    class Config:
133        schema_extra = {"group": "processing"}
134
135
136class OpenAIEmbeddingConfigModel(BaseModel):
137    mode: Literal["openai"] = Field("openai", const=True)
138    openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)
139
140    class Config(OneOfOptionConfig):
141        title = "OpenAI"
142        description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
143        discriminator = "mode"
144
145
146class OpenAICompatibleEmbeddingConfigModel(BaseModel):
147    mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
148    api_key: str = Field(title="API key", default="", airbyte_secret=True)
149    base_url: str = Field(
150        ...,
151        title="Base URL",
152        description="The base URL for your OpenAI-compatible service",
153        examples=["https://your-service-name.com"],
154    )
155    model_name: str = Field(
156        title="Model name",
157        description="The name of the model to use for embedding",
158        default="text-embedding-ada-002",
159        examples=["text-embedding-ada-002"],
160    )
161    dimensions: int = Field(
162        title="Embedding dimensions",
163        description="The number of dimensions the embedding model is generating",
164        examples=[1536, 384],
165    )
166
167    class Config(OneOfOptionConfig):
168        title = "OpenAI-compatible"
169        description = "Use a service that's compatible with the OpenAI API to embed text."
170        discriminator = "mode"
171
172
173class AzureOpenAIEmbeddingConfigModel(BaseModel):
174    mode: Literal["azure_openai"] = Field("azure_openai", const=True)
175    openai_key: str = Field(
176        ...,
177        title="Azure OpenAI API key",
178        airbyte_secret=True,
179        description="The API key for your Azure OpenAI resource.  You can find this in the Azure portal under your Azure OpenAI resource",
180    )
181    api_base: str = Field(
182        ...,
183        title="Resource base URL",
184        description="The base URL for your Azure OpenAI resource.  You can find this in the Azure portal under your Azure OpenAI resource",
185        examples=["https://your-resource-name.openai.azure.com"],
186    )
187    deployment: str = Field(
188        ...,
189        title="Deployment",
190        description="The deployment for your Azure OpenAI resource.  You can find this in the Azure portal under your Azure OpenAI resource",
191        examples=["your-resource-name"],
192    )
193
194    class Config(OneOfOptionConfig):
195        title = "Azure OpenAI"
196        description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
197        discriminator = "mode"
198
199
200class FakeEmbeddingConfigModel(BaseModel):
201    mode: Literal["fake"] = Field("fake", const=True)
202
203    class Config(OneOfOptionConfig):
204        title = "Fake"
205        description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
206        discriminator = "mode"
207
208
209class FromFieldEmbeddingConfigModel(BaseModel):
210    mode: Literal["from_field"] = Field("from_field", const=True)
211    field_name: str = Field(
212        ...,
213        title="Field name",
214        description="Name of the field in the record that contains the embedding",
215        examples=["embedding", "vector"],
216    )
217    dimensions: int = Field(
218        ...,
219        title="Embedding dimensions",
220        description="The number of dimensions the embedding model is generating",
221        examples=[1536, 384],
222    )
223
224    class Config(OneOfOptionConfig):
225        title = "From Field"
226        description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
227        discriminator = "mode"
228
229
230class CohereEmbeddingConfigModel(BaseModel):
231    mode: Literal["cohere"] = Field("cohere", const=True)
232    cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)
233
234    class Config(OneOfOptionConfig):
235        title = "Cohere"
236        description = "Use the Cohere API to embed text."
237        discriminator = "mode"
238
239
240class VectorDBConfigModel(BaseModel):
241    """
242    The configuration model for the Vector DB based destinations. This model is used to generate the UI for the destination configuration,
243    as well as to provide type safety for the configuration passed to the destination.
244
245    The configuration model is composed of four parts:
246    * Processing configuration
247    * Embedding configuration
248    * Indexing configuration
249    * Advanced configuration
250
251    Processing, embedding and advanced configuration are provided by this base class, while the indexing configuration is provided by the destination connector in the sub class.
252    """
253
254    embedding: Union[
255        OpenAIEmbeddingConfigModel,
256        CohereEmbeddingConfigModel,
257        FakeEmbeddingConfigModel,
258        AzureOpenAIEmbeddingConfigModel,
259        OpenAICompatibleEmbeddingConfigModel,
260    ] = Field(
261        ...,
262        title="Embedding",
263        description="Embedding configuration",
264        discriminator="mode",
265        group="embedding",
266        type="object",
267    )
268    processing: ProcessingConfigModel
269    omit_raw_text: bool = Field(
270        default=False,
271        title="Do not store raw text",
272        group="advanced",
273        description="Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.",
274    )
275
276    class Config:
277        title = "Destination Config"
278        schema_extra = {
279            "groups": [
280                {"id": "processing", "title": "Processing"},
281                {"id": "embedding", "title": "Embedding"},
282                {"id": "indexing", "title": "Indexing"},
283                {"id": "advanced", "title": "Advanced"},
284            ]
285        }
286
287    @staticmethod
288    def remove_discriminator(schema: Dict[str, Any]) -> None:
289        """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references"""
290        dpath.delete(schema, "properties/**/discriminator")
291
292    @classmethod
293    def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
294        """we're overriding the schema classmethod to enable some post-processing"""
295        schema: Dict[str, Any] = super().schema()
296        schema = resolve_refs(schema)
297        cls.remove_discriminator(schema)
298        return schema
class SeparatorSplitterConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['separator']
separators: List[str]
keep_separator: bool
class SeparatorSplitterConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):

Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).

Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
title = 'By Separator'
description = 'Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc.'
discriminator = 'mode'
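
A minimal sketch (not part of the module) of instantiating this option directly; the separator values are illustrative and follow the double-quote-wrapping convention described in the field description:

from airbyte_cdk.destinations.vector_db_based.config import SeparatorSplitterConfigModel

splitter = SeparatorSplitterConfigModel(
    separators=['"\\n\\n"', '". "'],  # paragraph breaks first, then sentence ends
    keep_separator=True,
)
assert splitter.mode == "separator"
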
class MarkdownHeaderSplitterConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['markdown']
split_level: int
class MarkdownHeaderSplitterConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'By Markdown header'
description = 'Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk.'
discriminator = 'mode'
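
An illustrative sketch of the split_level bounds: values outside 1..6 are rejected by pydantic validation.

from pydantic.v1 import ValidationError

from airbyte_cdk.destinations.vector_db_based.config import MarkdownHeaderSplitterConfigModel

splitter = MarkdownHeaderSplitterConfigModel(split_level=2)  # split on "#" and "##" headings

try:
    MarkdownHeaderSplitterConfigModel(split_level=7)
except ValidationError:
    pass  # only header levels 1 through 6 are allowed (ge=1, le=6)
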
class CodeSplitterConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['code']
language: str
class CodeSplitterConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'By Programming Language'
description = 'Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks.'
discriminator = 'mode'
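
An illustrative sketch: language has no default, so it must be provided. The list of supported languages above is carried into the generated JSON schema as an enum; the plain str field itself does not restrict values at construction time.

from airbyte_cdk.destinations.vector_db_based.config import CodeSplitterConfigModel

splitter = CodeSplitterConfigModel(language="python")
assert splitter.mode == "code"
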
class FieldNameMappingConfigModel(pydantic.v1.main.BaseModel):
from_field: str
to_field: str
class ProcessingConfigModel(pydantic.v1.main.BaseModel):
chunk_size: int
chunk_overlap: int
text_fields: Optional[List[str]]
metadata_fields: Optional[List[str]]
text_splitter: TextSplitterConfigModel
field_name_mappings: Optional[List[FieldNameMappingConfigModel]]
class ProcessingConfigModel.Config:
schema_extra = {'group': 'processing'}
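
A minimal sketch (illustrative values only) combining the processing options above, including a discriminated text_splitter and a rename via FieldNameMappingConfigModel:

from airbyte_cdk.destinations.vector_db_based.config import (
    FieldNameMappingConfigModel,
    MarkdownHeaderSplitterConfigModel,
    ProcessingConfigModel,
)

processing = ProcessingConfigModel(
    chunk_size=512,  # required; the spec allows 1..8191 tokens
    chunk_overlap=50,
    text_fields=["title", "body", "users.*.name"],  # dot notation and wildcards
    metadata_fields=["age", "user.name"],
    text_splitter=MarkdownHeaderSplitterConfigModel(split_level=2),
    field_name_mappings=[
        FieldNameMappingConfigModel(from_field="body", to_field="text")
    ],
)
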
class OpenAIEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['openai']
openai_key: str
class OpenAIEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'OpenAI'
description = 'Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions.'
discriminator = 'mode'
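
An illustrative sketch (the key is a placeholder) of parsing this option from a raw config dict; the constant "mode" value is what the parent oneOf uses as its discriminator:

from airbyte_cdk.destinations.vector_db_based.config import OpenAIEmbeddingConfigModel

embedding = OpenAIEmbeddingConfigModel.parse_obj({"mode": "openai", "openai_key": "sk-..."})
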
class OpenAICompatibleEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['openai_compatible']
api_key: str
base_url: str
model_name: str
dimensions: int
class OpenAICompatibleEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'OpenAI-compatible'
description = "Use a service that's compatible with the OpenAI API to embed text."
discriminator = 'mode'
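
An illustrative sketch (the URL is the documented example value): base_url and dimensions are required, while api_key and model_name fall back to their defaults.

from airbyte_cdk.destinations.vector_db_based.config import OpenAICompatibleEmbeddingConfigModel

embedding = OpenAICompatibleEmbeddingConfigModel(
    base_url="https://your-service-name.com",
    dimensions=384,
)
assert embedding.model_name == "text-embedding-ada-002"
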
class AzureOpenAIEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['azure_openai']
openai_key: str
api_base: str
deployment: str
class AzureOpenAIEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'Azure OpenAI'
description = 'Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions.'
discriminator = 'mode'
class FakeEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['fake']
class FakeEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'Fake'
description = 'Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs.'
discriminator = 'mode'
class FromFieldEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['from_field']
field_name: str
dimensions: int
class FromFieldEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'From Field'
description = 'Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store.'
discriminator = 'mode'
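
An illustrative sketch: with this option no embedding service is called; the destination reads a precomputed vector of the declared dimensionality from the named record field.

from airbyte_cdk.destinations.vector_db_based.config import FromFieldEmbeddingConfigModel

embedding = FromFieldEmbeddingConfigModel(field_name="embedding", dimensions=1536)
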
class CohereEmbeddingConfigModel(pydantic.v1.main.BaseModel):
mode: Literal['cohere']
cohere_key: str
class CohereEmbeddingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
title = 'Cohere'
description = 'Use the Cohere API to embed text.'
discriminator = 'mode'
class VectorDBConfigModel(pydantic.v1.main.BaseModel):

The configuration model for the Vector DB based destinations. This model is used to generate the UI for the destination configuration, as well as to provide type safety for the configuration passed to the destination.

The configuration model is composed of four parts:

  • Processing configuration
  • Embedding configuration
  • Indexing configuration
  • Advanced configuration

Processing, embedding and advanced configuration are provided by this base class, while the indexing configuration is provided by the destination connector in the sub class.

embedding: Union[OpenAIEmbeddingConfigModel, CohereEmbeddingConfigModel, FakeEmbeddingConfigModel, AzureOpenAIEmbeddingConfigModel, OpenAICompatibleEmbeddingConfigModel]
processing: ProcessingConfigModel
omit_raw_text: bool
@staticmethod
def remove_discriminator(schema: Dict[str, Any]) -> None:

pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references

@classmethod
def schema(cls, by_alias: bool = True, ref_template: str = '') -> Dict[str, Any]:

we're overriding the schema classmethod to enable some post-processing

class VectorDBConfigModel.Config:
title = 'Destination Config'
schema_extra = {'groups': [{'id': 'processing', 'title': 'Processing'}, {'id': 'embedding', 'title': 'Embedding'}, {'id': 'indexing', 'title': 'Indexing'}, {'id': 'advanced', 'title': 'Advanced'}]}
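
A minimal sketch of how a destination connector might supply the missing indexing part; MyIndexingConfigModel and its host field are hypothetical stand-ins for the connector-specific configuration:

from pydantic.v1 import BaseModel, Field

from airbyte_cdk.destinations.vector_db_based.config import VectorDBConfigModel


class MyIndexingConfigModel(BaseModel):
    host: str = Field(..., title="Host", description="Hypothetical connector-specific setting")

    class Config:
        schema_extra = {"group": "indexing"}


class MyDestinationConfigModel(VectorDBConfigModel):
    indexing: MyIndexingConfigModel


# The overridden schema() inlines $refs and strips the "discriminator" keys
# before the result is used as the connector spec.
spec = MyDestinationConfigModel.schema()
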