from typing import List, Optional

from unstructured.logger import logger
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import dependency_exists


def validate_strategy(strategy: str, is_image: bool = False):
    """Check that `strategy` is a known strategy and usable for the file type.

    Raises:
        ValueError: if `strategy` is not one of auto, fast, ocr_only or hi_res,
            or if the fast strategy is requested while `is_image` is true.
    """
    known_strategies = (
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
        PartitionStrategy.HI_RES,
    )
    if strategy not in known_strategies:
        raise ValueError(f"{strategy} is not a valid strategy.")

    fast_requested = strategy == PartitionStrategy.FAST
    if fast_requested and is_image:
        raise ValueError("The fast strategy is not available for image files.")


def determine_pdf_or_image_strategy(
    strategy: str,
    is_image: bool = False,
    pdf_text_extractable: bool = False,
    infer_table_structure: bool = False,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[List[str]] = None,
):
    """Pick the strategy that will actually be used for a PDF or an image.

    An "auto" request is first resolved to a concrete strategy. The result is
    then replaced by a fallback strategy when the optional dependency it needs
    is not installed; each fallback is reported through `logger.warning`.

    Returns:
        The resolved strategy, or the fallback chosen in its place.

    Raises:
        ValueError: if unstructured_inference is not installed, pytesseract is
            not installed and `pdf_text_extractable` is false. This check is
            made regardless of which strategy was requested.
    """
    has_pytesseract = dependency_exists("unstructured_pytesseract")
    has_inference = dependency_exists("unstructured_inference")

    if strategy == PartitionStrategy.AUTO:
        # Either image-extraction option counts as a request to extract elements.
        wants_element_extraction = extract_images_in_pdf or bool(extract_image_block_types)
        if is_image:
            strategy = _determine_image_auto_strategy()
        else:
            strategy = _determine_pdf_auto_strategy(
                pdf_text_extractable=pdf_text_extractable,
                infer_table_structure=infer_table_structure,
                extract_element=wants_element_extraction,
            )

    # None of the three ways of getting text out of the document is available.
    if not (has_inference or has_pytesseract or pdf_text_extractable):
        raise ValueError(
            "unstructured_inference is not installed, pytesseract is not installed "
            "and the text of the PDF is not extractable. "
            "To process this file, install unstructured_inference, install pytesseract, "
            "or remove copy protection from the PDF.",
        )

    if strategy == PartitionStrategy.HI_RES and not has_inference:
        logger.warning(
            "unstructured_inference is not installed. Cannot use the hi_res partitioning "
            "strategy. Falling back to partitioning with another strategy.",
        )
        # NOTE(robinson) - fallback to ocr_only if possible because it is the most
        # similar to hi_res
        if has_pytesseract:
            fallback, notice = (
                PartitionStrategy.OCR_ONLY,
                "Falling back to partitioning with ocr_only.",
            )
        else:
            fallback, notice = (
                PartitionStrategy.FAST,
                "Falling back to partitioning with fast.",
            )
        logger.warning(notice)
        return fallback

    if strategy == PartitionStrategy.OCR_ONLY and not has_pytesseract:
        logger.warning(
            "pytesseract is not installed. Cannot use the ocr_only partitioning "
            "strategy. Falling back to partitioning with another strategy.",
        )
        if pdf_text_extractable:
            fallback, notice = (
                PartitionStrategy.FAST,
                "Falling back to partitioning with fast.",
            )
        else:
            fallback, notice = (
                PartitionStrategy.HI_RES,
                "Falling back to partitioning with hi_res.",
            )
        logger.warning(notice)
        return fallback

    # The requested (or auto-resolved) strategy can be used as is.
    return strategy


def _determine_image_auto_strategy():
    """Resolve the "auto" strategy for an image; the answer is always hi_res."""
    # Use hi_res as the only default since images are only about one page
    return PartitionStrategy.HI_RES


def _determine_pdf_auto_strategy(
    pdf_text_extractable: bool = False,
    infer_table_structure: bool = False,
    extract_element: bool = False,
):
    """Resolve the "auto" strategy for a PDF.

    Returns hi_res when table-structure inference or element extraction is
    requested; otherwise fast when the PDF text is extractable, and ocr_only
    when it is not.
    """
    # NOTE(robinson) - Currently "hi_res" is the only strategy where
    # infer_table_structure and extract_images_in_pdf are used.
    if infer_table_structure or extract_element:
        return PartitionStrategy.HI_RES

    return PartitionStrategy.FAST if pdf_text_extractable else PartitionStrategy.OCR_ONLY