import contextlib
import os
import tempfile
from typing import BinaryIO, List, Tuple

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSSyntaxError

from unstructured.logger import logger
from unstructured.utils import requires_dependencies


def init_pdfminer():
    """Create the PDFMiner objects needed to lay out pages.

    A single ``PDFResourceManager`` is shared between a ``PDFPageAggregator``
    device (built with default ``LAParams``) and the ``PDFPageInterpreter``
    that renders into that device.

    Returns:
        A ``(device, interpreter)`` tuple.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return device, interpreter


def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
    """Recursively extract image objects from a given parent object in a PDF document.

    Args:
        parent_object: The layout item to search. An ``LTImage`` is returned as
            a single-element list; an ``LTContainer`` is searched child by
            child; any other item yields an empty list.

    Returns:
        The ``LTImage`` objects found, in traversal order.
    """
    objects = []

    if isinstance(parent_object, LTImage):
        objects.append(parent_object)
    elif isinstance(parent_object, LTContainer):
        for child in parent_object:
            objects.extend(extract_image_objects(child))

    return objects


def extract_text_objects(parent_object: LTItem) -> List[LTTextLine]:
    """Recursively extract text-line objects from a given parent object in a PDF document.

    Args:
        parent_object: The layout item to search. An ``LTTextLine`` is returned
            as a single-element list (its own children are not searched); any
            other ``LTContainer`` is searched child by child; any other item
            yields an empty list.

    Returns:
        The ``LTTextLine`` objects found, in traversal order.
    """
    objects = []

    if isinstance(parent_object, LTTextLine):
        objects.append(parent_object)
    elif isinstance(parent_object, LTContainer):
        for child in parent_object:
            objects.extend(extract_text_objects(child))

    return objects


def rect_to_bbox(
    rect: Tuple[float, float, float, float],
    height: float,
) -> Tuple[float, float, float, float]:
    """
    Convert PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
    coordinate system where the vertical axis is measured from the top of the page.

    Args:
        rect (Tuple[float, float, float, float]): A tuple representing PDF rectangle
            coordinates (x1, y1, x2, y2).
        height (float): The height of the page in the specified coordinate system.

    Returns:
        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
        (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
    """
    # The two vertical values are unpacked in swapped roles on purpose: after
    # subtracting from `height`, the rect's 4th value becomes the new y1 and
    # its 2nd value becomes the new y2.
    x1, y2, x2, y1 = rect
    y1 = height - y1
    y2 = height - y2
    return (x1, y1, x2, y2)


@requires_dependencies(["pikepdf", "pypdf"])
def open_pdfminer_pages_generator(
    fp: BinaryIO,
):
    """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.

    Each page is run through the PDFMiner interpreter. When a page raises
    ``PSSyntaxError``, that single page is extracted with ``get_page_data``,
    re-saved through pikepdf into a temporary file, and processed again from
    there. When the error escapes the page loop, the whole document is
    re-saved through pikepdf and every page of the repaired copy is processed.

    Repaired copies live in a temporary directory. Their file handles are kept
    open until the generator finishes or is closed, then closed before the
    directory is removed. A page object yielded from a repaired copy should
    therefore not be used after the generator has ended.

    Args:
        fp: Binary file object containing the PDF.

    Yields:
        ``(page, page_layout)`` tuples: the PDFMiner page and the layout result
        the aggregator device produced for it.

    Raises:
        PSSyntaxError: If a repaired page or repaired document still cannot be
            processed.
    """

    import pikepdf

    from unstructured.partition.pdf_image.pypdf_utils import get_page_data

    device, interpreter = init_pdfminer()

    # The ExitStack is listed second so that it unwinds first: every repaired
    # file is closed before TemporaryDirectory tries to delete it (deleting a
    # still-open file fails on Windows).
    with tempfile.TemporaryDirectory() as tmp_dir_path, contextlib.ExitStack() as open_files:
        try:
            pages = PDFPage.get_pages(fp)
            # Detect invalid dictionary construct for entire PDF
            for i, page in enumerate(pages):
                try:
                    # Detect invalid dictionary construct for one page
                    interpreter.process_page(page)
                    page_layout = device.get_result()
                except PSSyntaxError:
                    logger.info("Detected invalid dictionary construct for PDFminer")
                    logger.info(f"Repairing the PDF page {i+1} ...")
                    # find the error page from binary data fp
                    error_page_data = get_page_data(fp, page_number=i)
                    # One file per repaired page, so a later repair never
                    # overwrites a file that an earlier page still has open.
                    repaired_path = os.path.join(tmp_dir_path, f"tmp_file_page_{i}")
                    # repair the error page with pikepdf
                    with pikepdf.Pdf.open(error_page_data) as pdf:
                        pdf.save(repaired_path)
                    repaired_fp = open_files.enter_context(open(repaired_path, "rb"))
                    page = next(PDFPage.get_pages(repaired_fp))
                    interpreter.process_page(page)
                    page_layout = device.get_result()
                yield page, page_layout
        except PSSyntaxError:
            # NOTE(review): this fallback restarts from the first page of the
            # repaired document; if the error surfaced after some pages were
            # already yielded, those pages are yielded again — confirm callers
            # tolerate that.
            logger.info("Detected invalid dictionary construct for PDFminer")
            logger.info("Repairing the PDF document ...")
            repaired_path = os.path.join(tmp_dir_path, "tmp_file")
            # repair the entire doc with pikepdf
            with pikepdf.Pdf.open(fp) as pdf:
                pdf.save(repaired_path)
            repaired_fp = open_files.enter_context(open(repaired_path, "rb"))
            pages = PDFPage.get_pages(repaired_fp)
            for page in pages:
                interpreter.process_page(page)
                page_layout = device.get_result()
                yield page, page_layout