airbyte_cdk.sources.file_based.config.unstructured_format

  1#
  2# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  3#
  4
  5from typing import List, Literal, Optional, Union
  6
  7from pydantic.v1 import BaseModel, Field
  8
  9from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
 10
 11
 12class LocalProcessingConfigModel(BaseModel):
 13    mode: Literal["local"] = Field("local", const=True)
 14
 15    class Config(OneOfOptionConfig):
 16        title = "Local"
 17        description = (
 18            "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
 19        )
 20        discriminator = "mode"
 21
 22
 23class APIParameterConfigModel(BaseModel):
 24    name: str = Field(
 25        title="Parameter name",
 26        description="The name of the unstructured API parameter to use",
 27        examples=["combine_under_n_chars", "languages"],
 28    )
 29    value: str = Field(
 30        title="Value", description="The value of the parameter", examples=["true", "hi_res"]
 31    )
 32
 33
 34class APIProcessingConfigModel(BaseModel):
 35    mode: Literal["api"] = Field("api", const=True)
 36
 37    api_key: str = Field(
 38        default="",
 39        always_show=True,
 40        title="API Key",
 41        airbyte_secret=True,
 42        description="The API key to use matching the environment",
 43    )
 44
 45    api_url: str = Field(
 46        default="https://api.unstructured.io",
 47        title="API URL",
 48        always_show=True,
 49        description="The URL of the unstructured API to use",
 50        examples=["https://api.unstructured.com"],
 51    )
 52
 53    parameters: Optional[List[APIParameterConfigModel]] = Field(
 54        default=[],
 55        always_show=True,
 56        title="Additional URL Parameters",
 57        description="List of parameters send to the API",
 58    )
 59
 60    class Config(OneOfOptionConfig):
 61        title = "via API"
 62        description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
 63        discriminator = "mode"
 64
 65
 66class UnstructuredFormat(BaseModel):
 67    class Config(OneOfOptionConfig):
 68        title = "Unstructured Document Format"
 69        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
 70        discriminator = "filetype"
 71
 72    filetype: str = Field(
 73        "unstructured",
 74        const=True,
 75    )
 76
 77    skip_unprocessable_files: bool = Field(
 78        default=True,
 79        title="Skip Unprocessable Files",
 80        description="If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
 81        always_show=True,
 82    )
 83
 84    strategy: str = Field(
 85        always_show=True,
 86        order=0,
 87        default="auto",
 88        title="Parsing Strategy",
 89        enum=["auto", "fast", "ocr_only", "hi_res"],
 90        description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
 91    )
 92
 93    processing: Union[
 94        LocalProcessingConfigModel,
 95        APIProcessingConfigModel,
 96    ] = Field(
 97        default=LocalProcessingConfigModel(mode="local"),
 98        title="Processing",
 99        description="Processing configuration",
100        discriminator="mode",
101        type="object",
102    )
class LocalProcessingConfigModel(pydantic.v1.main.BaseModel):
class LocalProcessingConfigModel(BaseModel):
    # oneOf option that selects local file processing. It has a single
    # field, `mode`, which is a constant fixed to "local".
    mode: Literal["local"] = Field(default="local", const=True)

    class Config(OneOfOptionConfig):
        # `title` and `description` are the texts shown for this option;
        # `discriminator` names the field that tells the options apart.
        discriminator = "mode"
        title = "Local"
        description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
mode: Literal['local']
class LocalProcessingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
16    class Config(OneOfOptionConfig):
17        title = "Local"
18        description = (
19            "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
20        )
21        discriminator = "mode"

Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

Inherit from this class in the nested Config class of a model and set title and description (these show up in the UI) and discriminator (this ensures the discriminator field is marked as required in the schema).

Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
title = 'Local'
description = 'Process files locally, supporting `fast` and `ocr` modes. This is the default option.'
discriminator = 'mode'
class APIParameterConfigModel(pydantic.v1.main.BaseModel):
class APIParameterConfigModel(BaseModel):
    # A single name/value pair describing one parameter of the unstructured
    # API. Neither field declares a default.
    # NOTE(review): documented with `#` comments rather than a docstring;
    # pydantic v1 is understood to copy a model docstring into the generated
    # schema description - confirm before adding one.
    name: str = Field(
        description="The name of the unstructured API parameter to use",
        title="Parameter name",
        examples=["combine_under_n_chars", "languages"],
    )
    value: str = Field(
        description="The value of the parameter",
        title="Value",
        examples=["true", "hi_res"],
    )
name: str
value: str
class APIProcessingConfigModel(pydantic.v1.main.BaseModel):
class APIProcessingConfigModel(BaseModel):
    # oneOf option that selects processing through an unstructured API.
    # `mode` is a constant fixed to "api".
    mode: Literal["api"] = Field(default="api", const=True)

    # Flagged with `airbyte_secret`; defaults to an empty string.
    api_key: str = Field(
        title="API Key",
        description="The API key to use matching the environment",
        default="",
        always_show=True,
        airbyte_secret=True,
    )

    api_url: str = Field(
        title="API URL",
        description="The URL of the unstructured API to use",
        default="https://api.unstructured.io",
        always_show=True,
        examples=["https://api.unstructured.com"],
    )

    # Extra name/value parameters; an empty list by default.
    # NOTE(review): the description says "send" (probably meant "sent"). It is
    # reproduced verbatim because it is a runtime string emitted in the schema.
    parameters: Optional[List[APIParameterConfigModel]] = Field(
        title="Additional URL Parameters",
        description="List of parameters send to the API",
        default=[],
        always_show=True,
    )

    class Config(OneOfOptionConfig):
        # `title` and `description` are the texts shown for this option;
        # `discriminator` names the field that tells the options apart.
        discriminator = "mode"
        title = "via API"
        description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
mode: Literal['api']
api_key: str
api_url: str
parameters: Optional[List[APIParameterConfigModel]]
class APIProcessingConfigModel.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
61    class Config(OneOfOptionConfig):
62        title = "via API"
63        description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
64        discriminator = "mode"

Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

Inherit from this class in the nested Config class of a model and set title and description (these show up in the UI) and discriminator (this ensures the discriminator field is marked as required in the schema).

Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
title = 'via API'
description = 'Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured.'
discriminator = 'mode'
class UnstructuredFormat(pydantic.v1.main.BaseModel):
 67class UnstructuredFormat(BaseModel):
 68    class Config(OneOfOptionConfig):
 69        title = "Unstructured Document Format"
 70        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
 71        discriminator = "filetype"
 72
 73    filetype: str = Field(
 74        "unstructured",
 75        const=True,
 76    )
 77
 78    skip_unprocessable_files: bool = Field(
 79        default=True,
 80        title="Skip Unprocessable Files",
 81        description="If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
 82        always_show=True,
 83    )
 84
 85    strategy: str = Field(
 86        always_show=True,
 87        order=0,
 88        default="auto",
 89        title="Parsing Strategy",
 90        enum=["auto", "fast", "ocr_only", "hi_res"],
 91        description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
 92    )
 93
 94    processing: Union[
 95        LocalProcessingConfigModel,
 96        APIProcessingConfigModel,
 97    ] = Field(
 98        default=LocalProcessingConfigModel(mode="local"),
 99        title="Processing",
100        description="Processing configuration",
101        discriminator="mode",
102        type="object",
103    )
filetype: str
skip_unprocessable_files: bool
strategy: str
class UnstructuredFormat.Config(airbyte_cdk.utils.oneof_option_config.OneOfOptionConfig):
68    class Config(OneOfOptionConfig):
69        title = "Unstructured Document Format"
70        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
71        discriminator = "filetype"

Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

Inherit from this class in the nested Config class of a model and set title and description (these show up in the UI) and discriminator (this ensures the discriminator field is marked as required in the schema).

Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
title = 'Unstructured Document Format'
description = 'Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.'
discriminator = 'filetype'