from __future__ import annotations

from typing import IO, Any, Optional

from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html

DETECTION_ORIGIN: str = "epub"


def partition_epub(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition an EPUB document into elements.

    The EPUB source is converted to HTML text first, and that HTML is then handed to
    `partition_html`, which produces the returned elements.

    Parameters
    ----------
    filename
        Path of the EPUB file to partition. Checked together with `file` by `exactly_one`.
    file
        A file-like object opened in "rb" mode --> open(filename, "rb").
    metadata_filename
        Filename forwarded to `partition_html` as `metadata_filename`. When falsy, `filename`
        is forwarded instead.
    metadata_last_modified
        The last modified date for the document. When falsy and `filename` was given, the value
        returned by `get_last_modified_date(filename)` is used instead.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    detect_language_per_element
        Detect language per element instead of at the document level.
    **kwargs
        Any further keyword arguments are passed through to `partition_html` unchanged.

    Returns
    -------
    list[Element]
        Whatever `partition_html` returns for the converted HTML text.
    """
    exactly_one(filename=filename, file=file)

    # A last-modified date is only looked up when a filename was supplied; for a file-like
    # object there is no fallback value.
    if filename:
        file_last_modified = get_last_modified_date(filename)
    else:
        file_last_modified = None

    epub_as_html = convert_file_to_html_text_using_pandoc(
        source_format="epub",
        filename=filename,
        file=file,
    )

    # Caller-supplied metadata overrides take precedence over values derived from `filename`.
    effective_filename = metadata_filename or filename
    effective_last_modified = metadata_last_modified or file_last_modified

    return partition_html(
        text=epub_as_html,
        metadata_filename=effective_filename,
        metadata_file_type=FileType.EPUB,
        metadata_last_modified=effective_last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
Memory