airbyte_cdk.sources.file_based.config.unstructured_format
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

# Pydantic (v1 API) configuration models for the "unstructured" document
# format: the format model itself plus the two processing options it accepts
# (local processing, or processing through a hosted unstructured API).

from typing import List, Literal, Optional, Union

from pydantic.v1 import BaseModel, Field

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig


# Processing option that selects local processing.
class LocalProcessingConfigModel(BaseModel):
    # Discriminator field; the only accepted value is "local".
    mode: Literal["local"] = Field("local", const=True)

    class Config(OneOfOptionConfig):
        # NOTE(review): the description mentions an `ocr` mode, while the
        # strategy enum on UnstructuredFormat spells it `ocr_only` - confirm
        # which wording is intended before touching this user-facing text.
        title = "Local"
        description = (
            "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
        )
        discriminator = "mode"


# One name/value pair forwarded to the unstructured API. Both fields are
# declared without a default value.
class APIParameterConfigModel(BaseModel):
    name: str = Field(
        title="Parameter name",
        description="The name of the unstructured API parameter to use",
        examples=["combine_under_n_chars", "languages"],
    )
    # The value is typed as a string even for boolean-like settings (see the
    # "true" example).
    value: str = Field(
        title="Value", description="The value of the parameter", examples=["true", "hi_res"]
    )


# Processing option that selects processing through an unstructured API.
class APIProcessingConfigModel(BaseModel):
    # Discriminator field; the only accepted value is "api".
    mode: Literal["api"] = Field("api", const=True)

    # Defaults to an empty string and is flagged with `airbyte_secret=True`.
    api_key: str = Field(
        default="",
        always_show=True,
        title="API Key",
        airbyte_secret=True,
        description="The API key to use matching the environment",
    )

    # NOTE(review): the default host ends in `.io` while the example below uses
    # `.com` - confirm that the example is intentional.
    api_url: str = Field(
        default="https://api.unstructured.io",
        title="API URL",
        always_show=True,
        description="The URL of the unstructured API to use",
        examples=["https://api.unstructured.com"],
    )

    # Extra parameters as a list of name/value models; defaults to an empty list.
    # NOTE(review): "send" in the description looks like a typo for "sent"; it is
    # left untouched here because the string is part of the emitted spec.
    parameters: Optional[List[APIParameterConfigModel]] = Field(
        default=[],
        always_show=True,
        title="Additional URL Parameters",
        description="List of parameters send to the API",
    )

    class Config(OneOfOptionConfig):
        title = "via API"
        description = "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured."
        discriminator = "mode"


# Format model for unstructured documents, identified by the constant
# `filetype` value "unstructured".
class UnstructuredFormat(BaseModel):
    class Config(OneOfOptionConfig):
        title = "Unstructured Document Format"
        description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
        discriminator = "filetype"

    # Discriminator field; fixed to "unstructured" via `const=True`.
    filetype: str = Field(
        "unstructured",
        const=True,
    )

    # Defaults to True: per the description, unparseable files are skipped and
    # the error is reported on the record instead of failing the sync.
    skip_unprocessable_files: bool = Field(
        default=True,
        title="Skip Unprocessable Files",
        description="If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
        always_show=True,
    )

    # Parsing strategy; defaults to "auto". The allowed values are listed through
    # the `enum` keyword while the annotation stays a plain `str`.
    strategy: str = Field(
        always_show=True,
        order=0,
        default="auto",
        title="Parsing Strategy",
        enum=["auto", "fast", "ocr_only", "hi_res"],
        description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
    )

    # Either of the two processing options, told apart by their `mode` field.
    # The default is a local-processing instance built at class-definition time.
    processing: Union[
        LocalProcessingConfigModel,
        APIProcessingConfigModel,
    ] = Field(
        default=LocalProcessingConfigModel(mode="local"),
        title="Processing",
        description="Processing configuration",
        discriminator="mode",
        type="object",
    )
class LocalProcessingConfigModel(BaseModel):
    # Processing option that selects local processing. Its single field is the
    # discriminator `mode`, whose only accepted value is "local".

    class Config(OneOfOptionConfig):
        # `title` and `description` are the labels for this option;
        # `discriminator` names the field that identifies it.
        discriminator = "mode"
        title = "Local"
        description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."

    mode: Literal["local"] = Field(default="local", const=True)
class Config(OneOfOptionConfig):
    # Option metadata for the local-processing model: the discriminating field
    # plus the title and description used to present this option.
    discriminator = "mode"
    title = "Local"
    description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
Inherit from this class in a model's nested Config class and set title and description (both are shown in the UI) and discriminator (which ensures that the discriminator field is marked as required in the schema).
Usage:
    class OptionModel(BaseModel):
        mode: Literal["option_a"] = Field("option_a", const=True)
        option_a_field: str = Field(...)

        class Config(OneOfOptionConfig):
            title = "Option A"
            description = "Option A description"
            discriminator = "mode"
Inherited Members
class APIParameterConfigModel(BaseModel):
    # A single name/value pair describing one unstructured API parameter.
    # Neither field declares a default value.

    name: str = Field(
        description="The name of the unstructured API parameter to use",
        title="Parameter name",
        examples=["combine_under_n_chars", "languages"],
    )

    # Kept as a string even for boolean-like settings (see the "true" example).
    value: str = Field(
        description="The value of the parameter",
        title="Value",
        examples=["true", "hi_res"],
    )
class APIProcessingConfigModel(BaseModel):
    # Processing option that selects processing through an unstructured API.
    # `mode` is the discriminator and only accepts "api"; the remaining fields
    # carry the API key, the API URL and optional extra parameters.

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "via API"
        description = (
            "Process files via an API, using the `hi_res` mode. "
            "This option is useful for increased performance and accuracy, "
            "but requires an API key and a hosted instance of unstructured."
        )

    mode: Literal["api"] = Field(default="api", const=True)

    # Empty by default and flagged with `airbyte_secret=True`.
    api_key: str = Field(
        "",
        title="API Key",
        description="The API key to use matching the environment",
        always_show=True,
        airbyte_secret=True,
    )

    # NOTE(review): the default host ends in `.io` while the example uses `.com`;
    # confirm that the example is intentional.
    api_url: str = Field(
        "https://api.unstructured.io",
        title="API URL",
        description="The URL of the unstructured API to use",
        always_show=True,
        examples=["https://api.unstructured.com"],
    )

    # NOTE(review): "send" in the description looks like a typo for "sent"; the
    # text is reproduced unchanged because it is part of the emitted spec.
    parameters: Optional[List[APIParameterConfigModel]] = Field(
        [],
        title="Additional URL Parameters",
        description="List of parameters send to the API",
        always_show=True,
    )
class Config(OneOfOptionConfig):
    # Option metadata for the API-processing model: the discriminating field
    # plus the title and description used to present this option.
    discriminator = "mode"
    title = "via API"
    description = (
        "Process files via an API, using the `hi_res` mode. "
        "This option is useful for increased performance and accuracy, "
        "but requires an API key and a hosted instance of unstructured."
    )
Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
Inherit from this class in a model's nested Config class and set title and description (both are shown in the UI) and discriminator (which ensures that the discriminator field is marked as required in the schema).
Usage:
    class OptionModel(BaseModel):
        mode: Literal["option_a"] = Field("option_a", const=True)
        option_a_field: str = Field(...)

        class Config(OneOfOptionConfig):
            title = "Option A"
            description = "Option A description"
            discriminator = "mode"
Inherited Members
class UnstructuredFormat(BaseModel):
    # Format model for unstructured documents. `filetype` is the discriminator
    # and is fixed to "unstructured"; the other fields control error handling,
    # the parsing strategy and where processing happens.

    filetype: str = Field(default="unstructured", const=True)

    # On by default: per the description, unparseable files are skipped and the
    # error is reported on the record instead of failing the sync.
    skip_unprocessable_files: bool = Field(
        True,
        title="Skip Unprocessable Files",
        description=(
            "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. "
            "If false, fail the sync."
        ),
        always_show=True,
    )

    # Defaults to "auto"; the allowed values are supplied through the `enum`
    # keyword while the annotation stays a plain `str`.
    strategy: str = Field(
        "auto",
        title="Parsing Strategy",
        description=(
            "The strategy used to parse documents. "
            "`fast` extracts text directly from the document which doesn't work for all files. "
            "`ocr_only` is more reliable, but slower. "
            "`hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. "
            "See the unstructured.io documentation for more details: "
            "https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf"
        ),
        always_show=True,
        order=0,
        enum=["auto", "fast", "ocr_only", "hi_res"],
    )

    # One of the two processing options, told apart by their `mode` field.
    # The default is a local-processing instance built at class-definition time.
    processing: Union[LocalProcessingConfigModel, APIProcessingConfigModel] = Field(
        LocalProcessingConfigModel(mode="local"),
        title="Processing",
        description="Processing configuration",
        discriminator="mode",
        type="object",
    )

    class Config(OneOfOptionConfig):
        discriminator = "filetype"
        title = "Unstructured Document Format"
        description = (
            "Extract text from document formats (.pdf, .docx, .md, .pptx) "
            "and emit as one record per file."
        )
class Config(OneOfOptionConfig):
    # Option metadata for the unstructured format model: the discriminating
    # field plus the title and description used to present this option.
    discriminator = "filetype"
    title = "Unstructured Document Format"
    description = (
        "Extract text from document formats (.pdf, .docx, .md, .pptx) "
        "and emit as one record per file."
    )
Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
Inherit from this class in a model's nested Config class and set title and description (both are shown in the UI) and discriminator (which ensures that the discriminator field is marked as required in the schema).
Usage:
    class OptionModel(BaseModel):
        mode: Literal["option_a"] = Field("option_a", const=True)
        option_a_field: str = Field(...)

        class Config(OneOfOptionConfig):
            title = "Option A"
            description = "Option A description"
            discriminator = "mode"