from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.lang import check_language_args
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy
@process_metadata()
@add_metadata
@add_chunking_strategy
def partition_image(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = False,
    infer_table_structure: bool = False,
    ocr_languages: Optional[str] = None,
    languages: Optional[list[str]] = None,
    strategy: str = PartitionStrategy.HI_RES,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[list[str]] = None,
    extract_image_block_output_dir: Optional[str] = None,
    extract_image_block_to_payload: bool = False,
    starting_page_number: int = 1,
    extract_forms: bool = False,
    form_extraction_skip_tables: bool = True,
    **kwargs: Any,
) -> list[Element]:
    """Partition an image into a list of interpreted document elements.

    This is a thin front-end over `partition_pdf_or_image`: it validates the input source,
    resolves the language arguments, and delegates with `is_image=True`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object opened in binary mode, e.g. `open(filename, "rb")`.
    include_page_breaks
        If True, includes page breaks at the end of each page in the document.
    infer_table_structure
        Only applicable if `strategy=hi_res`.
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string,
        i.e. rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    ocr_languages
        Passed to `check_language_args` together with `languages`; the value that helper
        returns is what gets forwarded as `languages`.
        NOTE(review): presumably a legacy alternative to `languages` -- confirm.
    languages
        The languages present in the document, for use in partitioning and/or OCR. To use a
        language with Tesseract, you'll first need to install the appropriate Tesseract
        language pack. `None` is treated as an empty list.
    strategy
        The strategy to use for partitioning the image. Valid strategies are "hi_res" and
        "ocr_only". When using the "hi_res" strategy, the function uses a layout detection
        model to identify document elements. When using the "ocr_only" strategy,
        partition_image simply extracts the text from the document using OCR and processes it.
        The default strategy is `hi_res`.
    metadata_last_modified
        The last modified date for the document.
    chunking_strategy
        Not referenced in the function body.
        NOTE(review): presumably consumed by the `add_chunking_strategy` decorator -- confirm.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by
        'extract_image_block_output_dir' or stored as base64 encoded data within metadata
        fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_image_block_types' for broader extraction capabilities.
    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will
        be saved in the path specified by 'extract_image_block_output_dir' or stored as base64
        encoded data within metadata fields.
    extract_image_block_output_dir
        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_image_block_types'.
    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_image_block_types' will be
        encoded as base64 data and stored in two metadata fields: 'image_base64' and
        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    starting_page_number
        Forwarded unchanged to `partition_pdf_or_image`; defaults to 1.
        NOTE(review): presumably the page number assigned to the first page -- confirm.
    extract_forms
        Whether the form extraction logic should be run
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    **kwargs
        Any additional keyword arguments, forwarded unchanged to `partition_pdf_or_image`.

    Returns
    -------
    list[Element]
        Whatever `partition_pdf_or_image` returns for the given input.
    """
    # Validate the input source before touching the language arguments so that any error
    # from the source check surfaces first.
    # NOTE(review): `exactly_one` presumably rejects the call unless exactly one of
    # `filename` / `file` is provided -- confirm against its definition.
    exactly_one(filename=filename, file=file)

    # Merge `languages` and `ocr_languages` into the single value passed downstream.
    resolved_languages = check_language_args(languages or [], ocr_languages)

    # Everything this function sets explicitly on the shared PDF/image partitioner.
    forwarded_options: dict[str, Any] = {
        "filename": filename,
        "file": file,
        "is_image": True,
        "include_page_breaks": include_page_breaks,
        "infer_table_structure": infer_table_structure,
        "languages": resolved_languages,
        "strategy": strategy,
        "metadata_last_modified": metadata_last_modified,
        "hi_res_model_name": hi_res_model_name,
        "extract_images_in_pdf": extract_images_in_pdf,
        "extract_image_block_types": extract_image_block_types,
        "extract_image_block_output_dir": extract_image_block_output_dir,
        "extract_image_block_to_payload": extract_image_block_to_payload,
        "starting_page_number": starting_page_number,
        "extract_forms": extract_forms,
        "form_extraction_skip_tables": form_extraction_skip_tables,
    }
    return partition_pdf_or_image(**forwarded_options, **kwargs)