import json import uuid from io import BytesIO from pathlib import Path from typing import Optional from unstructured.partition.pdf_image.analysis.bbox_visualisation import ( AnalysisDrawer, FinalLayoutDrawer, LayoutDrawer, OCRLayoutDrawer, ODModelLayoutDrawer, PdfminerLayoutDrawer, ) from unstructured.partition.pdf_image.analysis.layout_dump import ( ExtractedLayoutDumper, FinalLayoutDumper, JsonLayoutDumper, LayoutDumper, ObjectDetectionLayoutDumper, OCRLayoutDumper, ) def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]: """For a given layout dumper, return the corresponding layout drawer instance initialized with a dumped layout dict. Args: dumper: The layout dumper instance Returns: LayoutDrawer: The corresponding layout drawer instance """ if isinstance(dumper, ObjectDetectionLayoutDumper): return ODModelLayoutDrawer(layout_dump=dumper.dump()) elif isinstance(dumper, ExtractedLayoutDumper): return PdfminerLayoutDrawer(layout_dump=dumper.dump()) elif isinstance(dumper, OCRLayoutDumper): return OCRLayoutDrawer(layout_dump=dumper.dump()) elif isinstance(dumper, FinalLayoutDumper): return FinalLayoutDrawer(layout_dump=dumper.dump()) else: raise ValueError(f"Unknown dumper type: {dumper}") def _generate_filename(is_image: bool): """Generate a filename for the analysis artifacts based on the file type. Adds a random uuid suffix """ suffix = uuid.uuid4().hex[:5] if is_image: return f"image_{suffix}.png" return f"pdf_{suffix}.pdf" def save_analysis_artifiacts( *layout_dumpers: LayoutDumper, is_image: bool, analyzed_image_output_dir_path: str, filename: Optional[str] = None, file: Optional[BytesIO] = None, skip_bboxes: bool = False, skip_dump_od: bool = False, draw_grid: bool = False, draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", ): """Save the analysis artifacts for a given file. Loads some settings from the environment configuration. Args: layout_dumpers: The layout dumpers to save and use for bboxes rendering is_image: Flag for the file type (pdf/image) analyzed_image_output_dir_path: The directory to save the analysis artifacts filename: The filename of the sources analyzed file (pdf/image). Only one of filename or file should be provided. file: The file object for the analyzed file. Only one of filename or file should be provided. draw_grid: Flag for drawing the analysis bboxes on a single image (as grid) draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. """ if not filename: filename = _generate_filename(is_image) if skip_bboxes or skip_dump_od: return output_path = Path(analyzed_image_output_dir_path) output_path.mkdir(parents=True, exist_ok=True) if not skip_dump_od: json_layout_dumper = JsonLayoutDumper( filename=filename, save_dir=output_path, ) for layout_dumper in layout_dumpers: json_layout_dumper.add_layout_dumper(layout_dumper) json_layout_dumper.process() if not skip_bboxes: analysis_drawer = AnalysisDrawer( filename=filename, file=file, is_image=is_image, save_dir=output_path, draw_grid=draw_grid, draw_caption=draw_caption, resize=resize, format=format, ) for layout_dumper in layout_dumpers: drawer = _get_drawer_for_dumper(layout_dumper) analysis_drawer.add_drawer(drawer) analysis_drawer.process() def render_bboxes_for_file( filename: str, analyzed_image_output_dir_path: str, renders_output_dir_path: Optional[str] = None, draw_grid: bool = False, draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", ): """Render the bounding boxes for a given layout dimp file. To be used for analysis after the partition is performed for only dumping the layouts - the bboxes can be rendered later. Expects that the analyzed_image_output_dir_path keeps the structure that was created by the save_analysis_artifacts function. Args: filename: The filename of the sources analyzed file (pdf/image) analyzed_image_output_dir_path: The directory where the analysis artifacts (layout dumps) are saved. It should be the root directory of the structure created by the save_analysis_artifacts function. renders_output_dir_path: Optional directory to save the rendered bboxes - if not provided, it will be saved in the analysis directory. draw_grid: Flag for drawing the analysis bboxes on a single image (as grid) draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. """ filename_stem = Path(filename).stem is_image = not Path(filename).suffix.endswith("pdf") analysis_dumps_dir = ( Path(analyzed_image_output_dir_path) / "analysis" / filename_stem / "layout_dump" ) if not analysis_dumps_dir.exists(): return layout_drawers = [] for analysis_dump_filename in analysis_dumps_dir.iterdir(): if not analysis_dump_filename.is_file(): continue with open(analysis_dump_filename) as f: layout_dump = json.load(f) if analysis_dump_filename.stem == "final": layout_drawers.append(FinalLayoutDrawer(layout_dump=layout_dump)) if analysis_dump_filename.stem == "object_detection": layout_drawers.append(ODModelLayoutDrawer(layout_dump=layout_dump)) if analysis_dump_filename.stem == "ocr": layout_drawers.append(OCRLayoutDrawer(layout_dump=layout_dump)) if analysis_dump_filename.stem == "pdfminer": layout_drawers.append(PdfminerLayoutDrawer(layout_dump=layout_dump)) if layout_drawers: if not renders_output_dir_path: output_path = ( Path(analyzed_image_output_dir_path) / "analysis" / filename_stem / "bboxes" ) else: output_path = Path(renders_output_dir_path) output_path.mkdir(parents=True, exist_ok=True) analysis_drawer = AnalysisDrawer( filename=filename, save_dir=output_path, is_image=is_image, draw_grid=draw_grid, draw_caption=draw_caption, resize=resize, format=format, ) for drawer in layout_drawers: analysis_drawer.add_drawer(drawer) analysis_drawer.process()
Memory