from __future__ import annotations
import os
import tempfile
from typing import IO, TYPE_CHECKING, Any, List, Optional, cast
import numpy as np
import pdf2image
# NOTE(yuming): Rename PIL.Image to avoid conflict with
# unstructured.documents.elements.Image
from PIL import Image as PILImage
from PIL import ImageSequence
from unstructured.documents.elements import ElementType
from unstructured.metrics.table.table_formats import SimpleTableCell
from unstructured.partition.pdf_image.analysis.layout_dump import OCRLayoutDumper
from unstructured.partition.pdf_image.pdf_image_utils import valid_text
from unstructured.partition.pdf_image.pdfminer_processing import (
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
)
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import OCRMode
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements
from unstructured_inference.models.tables import UnstructuredTableTransformerModel
def process_data_with_ocr(
    data: bytes | IO[bytes],
    out_layout: "DocumentLayout",
    extracted_layout: List[TextRegions],
    is_image: bool = False,
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    pdf_image_dpi: int = 200,
    ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
) -> "DocumentLayout":
    """
    Process OCR data from a given data and supplement the output DocumentLayout
    from unstructured_inference with ocr.

    The data is written to a temporary file and then handed to `process_file_with_ocr()`,
    which does the actual work.

    Parameters:
    - data (Union[bytes, BinaryIO]): The input file data,
        which can be either bytes or a BinaryIO object.

    - out_layout (DocumentLayout): The output layout from unstructured-inference.

    - extracted_layout (List[TextRegions]): The already extracted text regions, one entry per
        page. Forwarded unchanged to `process_file_with_ocr()`.

    - is_image (bool, optional): Indicates if the input data is an image (True) or not (False).
        Defaults to False.

    - infer_table_structure (bool, optional):  If true, extract the table content.

    - ocr_languages (str, optional): The languages for OCR processing. Defaults to "eng" (English).

    - ocr_mode (str, optional): The OCR processing mode, e.g., "entire_page" or "individual_blocks".
        Defaults to "entire_page". If choose "entire_page" OCR, OCR processes the entire image
        page and will be merged with the output layout. If choose "individual_blocks" OCR,
        OCR is performed on individual elements by cropping the image.

    - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.

    - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.

    Returns:
        DocumentLayout: The merged layout information obtained after OCR processing.
    """
    # Anything that is not raw bytes is treated as a readable binary stream.
    data_bytes = data if isinstance(data, bytes) else data.read()

    with tempfile.TemporaryDirectory() as tmp_dir_path:
        tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
        # Close the file before processing so that all bytes are flushed to disk.
        with open(tmp_file_path, "wb") as tmp_file:
            tmp_file.write(data_bytes)

        # Must run while the temporary directory still exists.
        merged_layouts = process_file_with_ocr(
            filename=tmp_file_path,
            out_layout=out_layout,
            extracted_layout=extracted_layout,
            is_image=is_image,
            infer_table_structure=infer_table_structure,
            ocr_languages=ocr_languages,
            ocr_mode=ocr_mode,
            pdf_image_dpi=pdf_image_dpi,
            ocr_layout_dumper=ocr_layout_dumper,
        )

    return merged_layouts
@requires_dependencies("unstructured_inference")
def process_file_with_ocr(
    filename: str,
    out_layout: "DocumentLayout",
    extracted_layout: List[TextRegions],
    is_image: bool = False,
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    pdf_image_dpi: int = 200,
    ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
) -> "DocumentLayout":
    """
    Process OCR data from a given file and supplement the output DocumentLayout
    from unstructured-inference with ocr.

    Parameters:
    - filename (str): The path to the input file, which can be an image or a PDF.

    - out_layout (DocumentLayout): The output layout from unstructured-inference.

    - extracted_layout (List[TextRegions]): a list of text regions extracted by pdfminer, one for
        each page

    - is_image (bool, optional): Indicates if the input data is an image (True) or not (False).
        Defaults to False.

    - infer_table_structure (bool, optional):  If true, extract the table content.

    - ocr_languages (str, optional): The languages for OCR processing. Defaults to "eng" (English).

    - ocr_mode (str, optional): The OCR processing mode, e.g., "entire_page" or "individual_blocks".
        Defaults to "entire_page". If choose "entire_page" OCR, OCR processes the entire image
        page and will be merged with the output layout. If choose "individual_blocks" OCR,
        OCR is performed on individual elements by cropping the image.

    - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.

    - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.

    Returns:
        DocumentLayout: The merged layout information obtained after OCR processing.

    Raises:
        FileNotFoundError: If processing fails and `filename` is neither an existing file nor an
            existing directory. Any other failure is re-raised unchanged.
    """

    from unstructured_inference.inference.layout import DocumentLayout

    def _merge_page(page_index: int, page_image: PILImage.Image) -> "PageLayout":
        """Supplement the layout of page `page_index` with OCR run on `page_image`."""
        # Pages beyond the end of `extracted_layout` have no extracted regions.
        extracted_regions = (
            extracted_layout[page_index] if page_index < len(extracted_layout) else None
        )
        return supplement_page_layout_with_ocr(
            page_layout=out_layout.pages[page_index],
            image=page_image,
            infer_table_structure=infer_table_structure,
            ocr_languages=ocr_languages,
            ocr_mode=ocr_mode,
            extracted_regions=extracted_regions,
            ocr_layout_dumper=ocr_layout_dumper,
        )

    merged_page_layouts: list[PageLayout] = []
    try:
        if is_image:
            # An image file may hold several frames (e.g. multi-page TIFF); each is one page.
            with PILImage.open(filename) as images:
                image_format = images.format
                for i, frame in enumerate(ImageSequence.Iterator(images)):
                    rgb_image = frame.convert("RGB")
                    # `convert()` returns a new image, so carry the source format over to it.
                    rgb_image.format = image_format
                    merged_page_layouts.append(_merge_page(i, rgb_image))
        else:
            with tempfile.TemporaryDirectory() as temp_dir:
                # `paths_only=True`: the rendered pages are written to `temp_dir` and only their
                # paths are returned, so pages are opened one at a time below.
                _image_paths = pdf2image.convert_from_path(
                    filename,
                    dpi=pdf_image_dpi,
                    output_folder=temp_dir,
                    paths_only=True,
                )
                image_paths = cast(List[str], _image_paths)
                for i, image_path in enumerate(image_paths):
                    with PILImage.open(image_path) as image:
                        merged_page_layouts.append(_merge_page(i, image))
        return DocumentLayout.from_pages(merged_page_layouts)
    except Exception as e:
        if os.path.isdir(filename) or os.path.isfile(filename):
            # The path exists, so the failure is genuine: propagate it untouched.
            raise
        raise FileNotFoundError(f'File "{filename}" not found!') from e
@requires_dependencies("unstructured_inference")
def supplement_page_layout_with_ocr(
    page_layout: "PageLayout",
    image: PILImage.Image,
    infer_table_structure: bool = False,
    ocr_languages: str = "eng",
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    extracted_regions: Optional[TextRegions] = None,
    ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
) -> "PageLayout":
    """
    Supplement a PageLayout with OCR results depending on OCR mode.
    If mode is "entire_page", we get the OCR layout for the entire image and
    merge it with PageLayout.
    If mode is "individual_blocks", we find the elements from PageLayout
    with no text and add text from OCR to each element.

    Parameters:
    - page_layout (PageLayout): The page layout to supplement. It is modified in place and
        also returned.

    - image (PIL.Image.Image): The image of the page.

    - infer_table_structure (bool, optional): If true, additionally run table extraction on
        the page's elements.

    - ocr_languages (str, optional): The languages for OCR processing. Defaults to "eng".

    - ocr_mode (str, optional): The OCR processing mode, see above.

    - extracted_regions (TextRegions, optional): Already extracted text regions of this page;
        only forwarded to `supplement_element_with_table_extraction()`.

    - ocr_layout_dumper (OCRLayoutDumper, optional): If given, receives the OCR layout of the
        page. Only used in "entire_page" mode.

    Returns:
        PageLayout: The same `page_layout` object, with `elements_array` and `elements` updated.

    Raises:
        ValueError: If `ocr_mode` is not one of the supported modes.
        RuntimeError: If `infer_table_structure` is set and the table agent cannot be loaded.
    """

    ocr_agent = OCRAgent.get_agent(language=ocr_languages)
    if ocr_mode == OCRMode.FULL_PAGE.value:
        ocr_layout = ocr_agent.get_layout_from_image(image)
        if ocr_layout_dumper:
            ocr_layout_dumper.add_ocred_page(ocr_layout.as_list())
        page_layout.elements_array = merge_out_layout_with_ocr_layout(
            out_layout=page_layout.elements_array,
            ocr_layout=ocr_layout,
        )
    elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
        # Individual block mode works directly on the vectorized `elements_array`; the plain
        # `elements` list is re-derived from it at the end of this function.
        elements_array = page_layout.elements_array
        # Same conversion as in `merge_out_layout_with_ocr_layout()`: with an object array,
        # NumPy cannot truncate the OCR text to the item width of a fixed-width string array.
        elements_array.texts = elements_array.texts.astype(object)
        padding = env_config.IMAGE_CROP_PAD
        for i, text in enumerate(elements_array.texts):
            if text:
                # This element already has text; OCR is only used to fill the gaps.
                continue
            cropped_image = image.crop(
                (
                    elements_array.x1[i] - padding,
                    elements_array.y1[i] - padding,
                    elements_array.x2[i] + padding,
                    elements_array.y2[i] + padding,
                ),
            )
            # Note(yuming): instead of getting OCR layout, we just need
            # the text extracted from OCR for individual elements
            elements_array.texts[i] = ocr_agent.get_text_from_image(cropped_image)
    else:
        raise ValueError(
            "Invalid OCR mode. Parameter `ocr_mode` "
            "must be set to `entire_page` or `individual_blocks`.",
        )

    # Note(yuming): use the OCR data from entire page OCR for table extraction
    if infer_table_structure:
        from unstructured_inference.models import tables

        tables.load_agent()
        if tables.tables_agent is None:
            raise RuntimeError("Unable to load table extraction agent.")

        page_layout.elements_array = supplement_element_with_table_extraction(
            elements=page_layout.elements_array,
            image=image,
            tables_agent=tables.tables_agent,
            ocr_agent=ocr_agent,
            extracted_regions=extracted_regions,
        )

    # Keep the list view of the elements in sync with the updated array structure.
    page_layout.elements = page_layout.elements_array.as_list()

    return page_layout
@requires_dependencies("unstructured_inference")
def supplement_element_with_table_extraction(
    elements: LayoutElements,
    image: PILImage.Image,
    tables_agent: "UnstructuredTableTransformerModel",
    ocr_agent: OCRAgent,
    extracted_regions: Optional[TextRegions] = None,
) -> LayoutElements:
    """Supplement the existing layout with table extraction. Any Table elements
    that are extracted will have a metadata fields "text_as_html" where
    the table's text content is rendered into a html string and "table_as_cells"
    with the raw table cells output from table agent if env_config.EXTRACT_TABLE_AS_CELLS is True

    Parameters:
    - elements (LayoutElements): The elements of one page. Updated in place and returned.

    - image (PIL.Image.Image): The image of the page; each table is cropped out of it.

    - tables_agent (UnstructuredTableTransformerModel): The model predicting the table cells.

    - ocr_agent (OCRAgent): The OCR agent providing the text tokens of each cropped table.

    - extracted_regions (TextRegions, optional): Not used by this function at present; kept
        in the signature for callers that pass it.

    Returns:
        LayoutElements: The same `elements` object that was passed in.
    """
    from unstructured_inference.models.tables import cells_to_html

    # `element_class_id_map` maps class id -> class name; invert it to look up the table id.
    table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
    # Compare against None explicitly: 0 is a legitimate class id and must not be mistaken
    # for "no table on this page".
    if table_id is None:
        # no table found in this page
        return elements

    table_ele_indices = np.where(elements.element_class_ids == table_id)[0]
    table_elements = elements.slice(table_ele_indices)
    padding = env_config.TABLE_IMAGE_CROP_PAD
    extract_table_as_cells = env_config.EXTRACT_TABLE_AS_CELLS
    # `table_elements` is the slice selected by `table_ele_indices`, so both are aligned.
    for element_index, element_coords in zip(table_ele_indices, table_elements.element_coords):
        cropped_image = image.crop(
            (
                element_coords[0] - padding,
                element_coords[1] - padding,
                element_coords[2] + padding,
                element_coords[3] + padding,
            ),
        )
        table_tokens = get_table_tokens(
            table_element_image=cropped_image,
            ocr_agent=ocr_agent,
        )
        tatr_cells = tables_agent.predict(
            cropped_image, ocr_tokens=table_tokens, result_format="cells"
        )

        # NOTE(christine): `tatr_cells == ""` means that the table was not recognized
        text_as_html = "" if tatr_cells == "" else cells_to_html(tatr_cells)
        elements.text_as_html[element_index] = text_as_html

        if extract_table_as_cells:
            # For an unrecognized table (`tatr_cells == ""`) this yields an empty list.
            simple_table_cells = [
                SimpleTableCell.from_table_transformer_cell(cell).to_dict() for cell in tatr_cells
            ]
            elements.table_as_cells[element_index] = simple_table_cells

    return elements
def get_table_tokens(
    table_element_image: PILImage.Image,
    ocr_agent: OCRAgent,
) -> List[dict[str, Any]]:
    """Run OCR on a cropped table image and return the result as a list of token dicts.

    Each token holds the bounding box (`bbox` as `[x1, y1, x2, y2]`) and the `text` of one OCR
    region, plus the `span_num`, `line_num` and `block_num` fields.
    """
    layout = ocr_agent.get_layout_from_image(image=table_element_image)

    # The tokens need to be in a relative reading order: `span_num` is the position of the
    # token in the OCR layout, while `line_num` and `block_num` are always 0.
    return [
        {
            "bbox": [layout.x1[idx], layout.y1[idx], layout.x2[idx], layout.y2[idx]],
            "text": token_text,
            "span_num": idx,
            "line_num": 0,
            "block_num": 0,
        }
        for idx, token_text in enumerate(layout.texts)
    ]
def merge_out_layout_with_ocr_layout(
    out_layout: LayoutElements,
    ocr_layout: TextRegions,
    supplement_with_ocr_elements: bool = True,
    subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
) -> LayoutElements:
    """
    Merge the out layout with the OCR-detected text regions on page level.

    Every out layout element whose text does not pass `valid_text()` gets its text replaced by
    the text aggregated from the OCR layout, using `subregion_threshold`. `out_layout` is
    modified in place. If `supplement_with_ocr_elements` is `True`, the result is additionally
    supplemented with the OCR layout via `supplement_layout_with_ocr_elements()`.
    """
    # Decide which elements need OCR text before the texts array is replaced below.
    needs_ocr_text = [
        position for position, current in enumerate(out_layout.texts) if not valid_text(current)
    ]

    # Switch the texts to an object array before assigning the aggregated texts into it.
    out_layout.texts = out_layout.texts.astype(object)

    for position in needs_ocr_text:
        aggregated_text = aggregate_embedded_text_by_block(
            target_region=out_layout.slice([position]),
            source_regions=ocr_layout,
            threshold=subregion_threshold,
        )
        out_layout.texts[position] = aggregated_text

    if not supplement_with_ocr_elements:
        return out_layout

    # NOTE: `subregion_threshold` is not forwarded here, so the supplementing step runs with
    # its own default threshold.
    return supplement_layout_with_ocr_elements(out_layout, ocr_layout)
def aggregate_ocr_text_by_block(
    ocr_layout: List["TextRegion"],
    region: "TextRegion",
    subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
) -> Optional[str]:
    """Return the space-joined text of all OCR regions that lie (almost) within `region`.

    An OCR region contributes when its bbox is an almost-subregion of `region.bbox` for the
    given `subregion_threshold` and its text is non-empty. The result is the empty string when
    no OCR region contributes."""

    texts_inside_block = [
        candidate.text
        for candidate in ocr_layout
        if candidate.bbox.is_almost_subregion_of(region.bbox, subregion_threshold)
        and candidate.text
    ]
    # Joining an empty list already gives "", so no special case is needed.
    return " ".join(texts_inside_block)
@requires_dependencies("unstructured_inference")
def supplement_layout_with_ocr_elements(
    layout: LayoutElements,
    ocr_layout: TextRegions,
    subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
) -> LayoutElements:
    """
    Supplement the existing layout with additional OCR-derived elements.

    This function takes two collections: the pre-existing layout elements (`layout`)
    and the OCR-detected text regions (`ocr_layout`). OCR regions that are (almost) a subregion
    of any element in the existing layout are dropped; the remaining OCR regions are converted
    to layout elements and appended to the existing layout.

    Parameters:
    - layout (LayoutElements): A collection of existing layout elements in array structures
    - ocr_layout (TextRegions): A collection of OCR-derived text regions in array structures
    - subregion_threshold (float, optional): The subregion matching threshold. Defaults to the
        env_config `OCR_LAYOUT_SUBREGION_THRESHOLD`.

    Returns:
    - LayoutElements: The final combined layout consisting of both the original layout
      elements and the new OCR-derived elements. When there is no OCR region to add, `layout`
      itself is returned.

    Note:
    - The function relies on `bboxes1_is_almost_subregion_of_bboxes2()` to determine if an OCR
        region is a subregion of an existing layout element.
    - It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to
        layout elements.
    """

    from unstructured_inference.inference.layoutelement import LayoutElements

    from unstructured.partition.pdf_image.inference_utils import (
        build_layout_elements_from_ocr_regions,
    )

    # One row per OCR region, one column per layout element: a region is covered when it is
    # an almost-subregion of at least one layout element.
    is_covered = (
        bboxes1_is_almost_subregion_of_bboxes2(
            ocr_layout.element_coords, layout.element_coords, subregion_threshold
        )
        .sum(axis=1)
        .astype(bool)
    )
    mask = ~is_covered

    if not mask.any():
        # Every OCR region is already covered by the layout (or there is none at all).
        return layout

    # add ocr regions that are not covered by layout
    ocr_elements_to_add = build_layout_elements_from_ocr_regions(ocr_layout.slice(mask))
    return LayoutElements.concatenate([layout, ocr_elements_to_add])