from __future__ import annotations

import contextlib
from typing import IO, Any, Optional, Sequence

import requests
from unstructured_client import UnstructuredClient
from unstructured_client.models import operations, shared
from unstructured_client.utils import retries

from unstructured.documents.elements import Element
from unstructured.logger import logger
from unstructured.partition.common.common import exactly_one
from unstructured.staging.base import elements_from_dicts, elements_from_json

# Default retry configuration taken from the client code
# NOTE(review): the names and docstrings below say "seconds", but the magnitudes look like
# milliseconds -- confirm against the SDK's BackoffStrategy before relying on the unit.
DEFAULT_RETRIES_INITIAL_INTERVAL_SEC = 3000
DEFAULT_RETRIES_MAX_INTERVAL_SEC = 720000
DEFAULT_RETRIES_EXPONENT = 1.5
DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC = 1800000
DEFAULT_RETRIES_CONNECTION_ERRORS = True

# Endpoint path that callers usually include in `api_url`; the SDK wants the URL without it.
_GENERAL_API_PATH = "/general/v0/general"


def partition_via_api(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_filename: Optional[str] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    metadata_filename: Optional[str] = None,
    retries_initial_interval: Optional[int] = None,
    retries_max_interval: Optional[int] = None,
    retries_exponent: Optional[float] = None,
    retries_max_elapsed_time: Optional[int] = None,
    retries_connection_errors: Optional[bool] = None,
    **request_kwargs: Any,
) -> list[Element]:
    """Partitions a document using the Unstructured REST API.

    This is equivalent to running the document through partition.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    content_type
        A string defining the file content in MIME type. Accepted for interface
        compatibility; this function does not currently forward it to the API.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    file_filename
        Deprecated alias of `metadata_filename`. Passing both raises a ValueError.
    metadata_filename
        When file is not None, the filename (string) to store in element metadata.
        E.g. "foo.txt"
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.
    retries_initial_interval
        Defines the time interval (in seconds) to wait before the first retry in case of a
        request failure. Defaults to 3000. If set should be > 0.
    retries_max_interval
        Defines the maximum time interval (in seconds) to wait between retries (the interval
        between retries is increased as using exponential increase algorithm - this setting
        limits it). Defaults to 720000. If set should be > 0.
    retries_exponent
        Defines the exponential factor to increase the interval between retries.
        Defaults to 1.5. If set should be > 0.0.
    retries_max_elapsed_time
        Defines the maximum time (in seconds) to wait for retries. If exceeded, the original
        exception is raised. Defaults to 1800000. If set should be > 0.
    retries_connection_errors
        Defines whether to retry on connection errors. Defaults to True.
    request_kwargs
        Additional parameters to pass to the data field of the request to the Unstructured
        API. For example the `strategy` parameter.

    Returns
    -------
    The elements parsed from the JSON body of the API response.

    Raises
    ------
    ValueError
        If both `metadata_filename` and `file_filename` are given, if `file` is given without
        a metadata filename, or if the API responds with a status code other than 200.
    """
    exactly_one(filename=filename, file=file)

    if metadata_filename and file_filename:
        raise ValueError(
            "Only one of metadata_filename and file_filename is specified. "
            "metadata_filename is preferred. file_filename is marked for deprecation.",
        )

    if file_filename is not None:
        metadata_filename = file_filename
        # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
        logger.warning(
            "The file_filename kwarg will be deprecated in a future version of unstructured. "
            "Please use metadata_filename instead.",
        )

    # Note(austin) - the sdk takes the base url, but we have the full api_url
    # For consistency, just strip off the path when it's given
    # Cut at the path itself rather than slicing a fixed number of characters so that a
    # trailing slash or query string after the path does not corrupt the base URL.
    # `partition` returns the whole string as the head when the path is absent.
    base_url, _, _ = api_url.partition(_GENERAL_API_PATH)

    sdk = UnstructuredClient(api_key_auth=api_key, server_url=base_url)

    if filename is not None:
        with open(filename, "rb") as f:
            files = shared.Files(
                content=f.read(),
                file_name=filename,
            )
    elif file is not None:
        if metadata_filename is None:
            raise ValueError(
                "If file is specified in partition_via_api, "
                "metadata_filename must be specified as well.",
            )
        files = shared.Files(content=file, file_name=metadata_filename)

    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(files=files, **request_kwargs)
    )

    retries_config = get_retries_config(
        retries_connection_errors=retries_connection_errors,
        retries_exponent=retries_exponent,
        retries_initial_interval=retries_initial_interval,
        retries_max_elapsed_time=retries_max_elapsed_time,
        retries_max_interval=retries_max_interval,
        sdk=sdk,
    )

    response = sdk.general.partition(
        request=req,
        retries=retries_config,
    )

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )
    return elements_from_json(text=response.raw_response.text)


def get_retries_config(
    retries_connection_errors: Optional[bool],
    retries_exponent: Optional[float],
    retries_initial_interval: Optional[int],
    retries_max_elapsed_time: Optional[int],
    retries_max_interval: Optional[int],
    sdk: UnstructuredClient,
) -> Optional[retries.RetryConfig]:
    """Constructs a RetryConfig object from the provided parameters.

    If any of the parameters are None, the default values are taken from the SDK configuration
    or the default constants. If all parameters are None, returns None (and the SDK-managed
    defaults are used within the client).

    The solution is not perfect as the RetryConfig object does not include the defaults by
    itself so we might need to construct it basing on our defaults.

    Parameters
    ----------
    retries_connection_errors
        Defines whether to retry on connection errors. If not set, the value from the SDK
        configuration is used when available, otherwise the
        DEFAULT_RETRIES_CONNECTION_ERRORS constant.
    retries_exponent
        Defines the exponential factor to increase the interval between retries.
        If set, should be > 0.0 (otherwise the DEFAULT_RETRIES_EXPONENT constant is used)
    retries_initial_interval
        Defines the time interval to wait before the first retry in case of a request failure.
        If set, should be > 0 (otherwise the DEFAULT_RETRIES_INITIAL_INTERVAL_SEC constant is
        used)
    retries_max_elapsed_time
        Defines the maximum time to wait for retries. If exceeded, the original exception is
        raised. If set, should be > 0 (otherwise the DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC
        constant is used)
    retries_max_interval
        Defines the maximum time interval to wait between retries. If set, should be > 0
        (otherwise the DEFAULT_RETRIES_MAX_INTERVAL_SEC constant is used)
    sdk
        The UnstructuredClient object to take the default values from.

    Returns
    -------
    A RetryConfig using the "backoff" strategy, or None when no retry setting was provided.
    """
    sdk_default_retries_config = sdk.sdk_configuration.retry_config

    explicit_settings = (
        retries_initial_interval,
        retries_max_interval,
        retries_exponent,
        retries_max_elapsed_time,
        retries_connection_errors,
    )
    if all(setting is None for setting in explicit_settings):
        # Nothing was customised: let the SDK apply its own retry defaults.
        return None

    def get_backoff_default(setting_name: str, default_value: Any) -> Any:
        """Return the SDK-configured backoff value, or `default_value` if it is unset/falsy."""
        if sdk_default_retries_config:
            # The `None` default guards against a config without a backoff strategy (or
            # without this attribute) instead of raising AttributeError.
            setting_value = getattr(sdk_default_retries_config.backoff, setting_name, None)
            if setting_value:
                return setting_value
        return default_value

    if (
        sdk_default_retries_config
        and sdk_default_retries_config.retry_connection_errors is not None
    ):
        default_retries_connection_errors = sdk_default_retries_config.retry_connection_errors
    else:
        default_retries_connection_errors = DEFAULT_RETRIES_CONNECTION_ERRORS

    # `or` is deliberate: a value of 0 is invalid (must be > 0) and falls back to the default.
    backoff_strategy = retries.BackoffStrategy(
        initial_interval=(
            retries_initial_interval
            or get_backoff_default("initial_interval", DEFAULT_RETRIES_INITIAL_INTERVAL_SEC)
        ),
        max_interval=(
            retries_max_interval
            or get_backoff_default("max_interval", DEFAULT_RETRIES_MAX_INTERVAL_SEC)
        ),
        exponent=(
            retries_exponent or get_backoff_default("exponent", DEFAULT_RETRIES_EXPONENT)
        ),
        max_elapsed_time=(
            retries_max_elapsed_time
            or get_backoff_default("max_elapsed_time", DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC)
        ),
    )

    # The boolean must be compared against None: an explicit False has to be honoured.
    return retries.RetryConfig(
        strategy="backoff",
        backoff=backoff_strategy,
        retry_connection_errors=(
            retries_connection_errors
            if retries_connection_errors is not None
            else default_retries_connection_errors
        ),
    )


def partition_multiple_via_api(
    filenames: Optional[list[str]] = None,
    content_types: Optional[list[str]] = None,
    files: Optional[Sequence[IO[bytes]]] = None,
    file_filenames: Optional[list[str]] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    metadata_filenames: Optional[list[str]] = None,
    **request_kwargs: Any,
) -> list[list[Element]]:
    """Partitions multiple documents using the Unstructured REST API by batching
    the documents into a single HTTP request.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filenames
        A list of strings defining the target filename paths. Takes precedence over `files`
        when both are given.
    content_types
        A list of strings defining the file contents in MIME types. When given (non-empty),
        it must have the same length as the documents being sent.
    files
        A list of file-like object using "rb" mode --> open(filename, "rb").
    file_filenames
        Deprecated alias of `metadata_filenames`. Passing both raises a ValueError.
    metadata_filenames
        When files is not None, the filenames (strings) to store in element metadata.
        E.g. ["foo.txt"]
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.
    request_kwargs
        Additional parameters to pass to the data field of the request to the Unstructured
        API. For example the `strategy` parameter.

    Returns
    -------
    One list of elements per document in the API response.

    Raises
    ------
    ValueError
        If the arguments are inconsistent (conflicting filename kwargs, mismatched lengths,
        missing metadata filenames, neither `filenames` nor `files` given) or if the API
        responds with a status code other than 200.
    """
    headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }

    if metadata_filenames and file_filenames:
        raise ValueError(
            "Only one of metadata_filenames and file_filenames is specified. "
            "metadata_filenames is preferred. file_filenames is marked for deprecation.",
        )

    if file_filenames is not None:
        metadata_filenames = file_filenames
        # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
        logger.warning(
            "The file_filenames kwarg will be deprecated in a future version of unstructured. "
            "Please use metadata_filenames instead.",
        )

    # The ExitStack owns only the handles opened here from `filenames`; caller-supplied
    # `files` are never registered with it and therefore stay open for the caller.
    with contextlib.ExitStack() as stack:
        if filenames is not None:
            if content_types and len(content_types) != len(filenames):
                raise ValueError("content_types and filenames must have the same length.")
            upload_names: Sequence[str] = filenames
            streams: Sequence[IO[bytes]] = [
                stack.enter_context(open(path, "rb")) for path in filenames
            ]
        elif files is not None:
            if content_types and len(content_types) != len(files):
                raise ValueError("content_types and files must have the same length.")
            if not metadata_filenames:
                raise ValueError("metadata_filenames must be specified if files are passed")
            if len(metadata_filenames) != len(files):
                raise ValueError("metadata_filenames and files must have the same length.")
            upload_names = metadata_filenames
            streams = files
        else:
            # Previously this fell through and failed with an UnboundLocalError on `response`.
            raise ValueError("Either filenames or files must be specified.")

        # An empty/missing content_types means "let the server decide" for every document.
        mime_types: Sequence[Optional[str]] = (
            content_types if content_types else [None] * len(upload_names)
        )
        _files = [
            ("files", (name, stream, mime_type))
            for name, stream, mime_type in zip(upload_names, streams, mime_types)
        ]

        # The request must be sent while the handles opened above are still open.
        response = requests.post(
            api_url,
            headers=headers,
            data=request_kwargs,
            files=_files,  # type: ignore
        )

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    response_list = response.json()
    # NOTE(robinson) - this check is because if only one filename is passed, the return
    # type from the API is a list of objects instead of a list of lists
    # The emptiness guard avoids an IndexError when the API returns an empty list.
    if response_list and not isinstance(response_list[0], list):
        response_list = [response_list]

    return [elements_from_dicts(document) for document in response_list]