from __future__ import annotations
from typing import IO, Any
from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
# Value `partition_org` passes to `partition_html` as its `detection_origin` argument.
DETECTION_ORIGIN: str = "org"
def partition_org(
    filename: str | None = None,
    *,
    file: IO[bytes] | None = None,
    metadata_filename: str | None = None,
    metadata_last_modified: str | None = None,
    **kwargs: Any,
) -> list[Element]:
    """Partition an Org document into document elements.

    The Org source is converted to HTML text with pandoc, and that HTML is then handed
    to `partition_html`, which produces the elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to record in element metadata. When not given (or empty),
        `filename` is used instead.
    metadata_last_modified
        The last modified date for the document. When not given (or empty) and
        `filename` is provided, the date reported by `get_last_modified_date(filename)`
        is used instead.
    **kwargs
        Any additional keyword arguments are forwarded unchanged to `partition_html`.

    Returns
    -------
    list[Element]
        The elements returned by `partition_html` for the converted document.
    """
    # The source must be identified via `filename` or `file`; the check itself is
    # delegated to `exactly_one` (presumably it rejects zero or both -- see that helper).
    exactly_one(filename=filename, file=file)

    # A modification date is only looked up when a path was given.
    if filename:
        detected_last_modified = get_last_modified_date(filename)
    else:
        detected_last_modified = None

    org_as_html = convert_file_to_html_text_using_pandoc(
        source_format="org",
        filename=filename,
        file=file,
    )

    # Caller-supplied metadata wins; otherwise fall back to what the path provides.
    reported_filename = metadata_filename or filename
    reported_last_modified = metadata_last_modified or detected_last_modified

    return partition_html(
        text=org_as_html,
        metadata_filename=reported_filename,
        metadata_file_type=FileType.ORG,
        metadata_last_modified=reported_last_modified,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )