from __future__ import annotations
import os
import tempfile
from typing import IO, Any, Optional
from unstructured.documents.elements import Element
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import convert_office_doc, exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.pptx import partition_pptx
def partition_ppt(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.

    The source document is converted to .pptx inside a temporary directory via
    `convert_office_doc()` and the converted file is then handed to `partition_pptx()`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to record in element metadata. When omitted, the path of the document
        being converted is passed to `partition_pptx()` instead. When `file` is used, each
        element's `metadata.filename` is finally set to this value (which may be `None`) so
        the temporary file path does not leak into the results.
    metadata_last_modified
        The last modified date for the document. When omitted and `filename` is given, the
        value returned by `get_last_modified_date(filename)` is used.

    Note that all arguments valid on `partition_pptx()` are also valid here and will be passed
    along to the `partition_pptx()` function.

    Returns
    -------
    list[Element]
        The elements produced by `partition_pptx()` for the converted document.

    Raises
    ------
    ValueError
        If `filename` is given but no such path exists, or if no usable source was provided.
        (`exactly_one()` is presumably what rejects "both" / "neither" -- its behavior is
        defined outside this module.)
    """
    # -- Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file)

    # -- Validate the path up front, before it is used for anything else and before a
    # -- temporary directory is created for a request that cannot succeed.
    if filename:
        if not os.path.exists(filename):
            raise ValueError(f"The file {filename} does not exist.")
        last_modified = get_last_modified_date(filename)
    else:
        last_modified = None

    with tempfile.TemporaryDirectory() as tmpdir:
        if filename:
            source_path = filename
        else:
            # -- Explicit check rather than `assert`, which is stripped under `python -O`.
            if file is None:
                raise ValueError("Exactly one of `filename` or `file` must be provided.")
            # -- Spool the stream to disk; the converter is given a path, not a stream.
            source_path = os.path.join(tmpdir, "tmp_file")
            with open(source_path, "wb") as tmp_file:
                tmp_file.write(file.read())

        # -- The converted file is expected at "<source stem>.pptx" inside `tmpdir`.
        _, filename_no_path = os.path.split(os.path.abspath(source_path))
        base_filename, _ = os.path.splitext(filename_no_path)

        convert_office_doc(
            source_path,
            tmpdir,
            target_format="pptx",
            target_filter="Impress MS PowerPoint 2007 XML",
        )
        pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")

        # -- Partition while `tmpdir` still exists; it is deleted when the `with` block exits.
        elements = partition_pptx(
            filename=pptx_filename,
            metadata_filename=metadata_filename or source_path,
            metadata_file_type=FileType.PPT,
            metadata_last_modified=metadata_last_modified or last_modified,
            **kwargs,
        )

    # -- Remove the temporary file name from the metadata when parsing a file object.
    # -- `is not None` (not truthiness) so a file-like object that evaluates falsy is
    # -- still recognized as the input source.
    if file is not None:
        for element in elements:
            element.metadata.filename = metadata_filename

    return elements