from __future__ import annotations

import os
import shutil
import tempfile
from typing import IO

from unstructured.partition.common.common import exactly_one
from unstructured.utils import requires_dependencies


@requires_dependencies(["pypandoc"])
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
    """Uses pandoc to convert the source document to a raw text string.

    Args:
        filename: Path of the document to convert.
        source_format: Pandoc name of the input format (passed as `format=`).
        target_format: Pandoc name of the output format.

    Returns:
        The converted document as returned by `pypandoc.convert_file()`.

    Raises:
        FileNotFoundError: When `pypandoc.convert_file()` raises `FileNotFoundError`; re-raised
            with a hint to install pandoc, chained to the original exception.
        RuntimeError: When `pypandoc.convert_file()` raises `RuntimeError`; re-raised with the
            installed pandoc version (and an RTF-support note when applicable), chained to the
            original exception.
    """
    # Imported lazily so the module can be imported without the optional dependency installed.
    import pypandoc

    try:
        text = pypandoc.convert_file(filename, target_format, format=source_format)
    except FileNotFoundError as err:
        msg = (
            f"Error converting the file to text. Ensure you have the pandoc package installed on"
            f" your system. Installation instructions are available at"
            f" https://pandoc.org/installing.html. The original exception text was:\n{err}"
        )
        # Chain explicitly so the traceback shows the original error as the direct cause.
        raise FileNotFoundError(msg) from err
    except RuntimeError as err:
        supported_source_formats, _ = pypandoc.get_pandoc_formats()

        # Only add the RTF hint when the installed pandoc really lacks RTF input support.
        if source_format == "rtf" and source_format not in supported_source_formats:
            additional_info = (
                "Support for RTF files is not available in the current pandoc installation. "
                "It was introduced in pandoc 2.14.2.\n"
                "Reference: https://pandoc.org/releases.html#pandoc-2.14.2-2021-08-21"
            )
        else:
            additional_info = ""

        msg = (
            f"{err}\n\n{additional_info}\n\n"
            f"Current version of pandoc: {pypandoc.get_pandoc_version()}\n"
            "Make sure you have the right version installed in your system. Please follow the"
            " pandoc installation instructions in README.md to install the right version."
        )
        raise RuntimeError(msg) from err

    return text


def convert_file_to_html_text_using_pandoc(
    source_format: str, filename: str | None = None, file: IO[bytes] | None = None
) -> str:
    """Converts a document to HTML raw text.

    Enables the document to be processed using `partition_html()`.

    Exactly one of `filename` and `file` must be provided (checked with `exactly_one()`).

    Args:
        source_format: Pandoc name of the input format; also used as the extension of the
            temporary file created when `file` is given.
        filename: Path of the document to convert.
        file: Binary file-like object containing the document to convert.

    Returns:
        The HTML text produced by `convert_file_to_text()`.
    """
    exactly_one(filename=filename, file=file)

    if file is not None:
        # Conversion works on a path, so spool the stream into a temporary file first. The
        # directory (and the file in it) is removed when the `with` block exits.
        with tempfile.TemporaryDirectory() as temp_dir_path:
            tmp_file_path = os.path.join(temp_dir_path, f"tmp_file.{source_format}")
            with open(tmp_file_path, "wb") as tmp_file:
                # Copy in chunks rather than loading the whole document into memory at once.
                shutil.copyfileobj(file, tmp_file)
            return convert_file_to_text(
                filename=tmp_file_path, source_format=source_format, target_format="html"
            )

    # `exactly_one()` has already validated the arguments; this only narrows the type.
    assert filename is not None
    return convert_file_to_text(
        filename=filename, source_format=source_format, target_format="html"
    )
Memory