from __future__ import annotations
from typing import IO, Any
import markdown
import requests
from unstructured.documents.elements import Element
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
def optional_decode(contents: str | bytes) -> str:
    """Return `contents` as text, decoding it as UTF-8 when it is a `bytes` object.

    A value that is not a `bytes` instance is handed back unchanged.
    """
    if not isinstance(contents, bytes):
        return contents
    decoded = contents.decode("utf-8")
    return decoded
# -- value `partition_md()` passes to `partition_html()` as its `detection_origin` argument --
DETECTION_ORIGIN: str = "md"
def partition_md(
    filename: str | None = None,
    file: IO[bytes] | None = None,
    text: str | None = None,
    url: str | None = None,
    metadata_filename: str | None = None,
    metadata_last_modified: str | None = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions a markdown file into its constituent elements

    The markdown source is rendered to HTML (with the "tables" extension enabled) and that HTML
    is then partitioned by `partition_html()`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    text
        The string representation of the markdown document.
    url
        The URL of a webpage to parse. Only for URLs that return a markdown document.
    metadata_filename
        Passed to `partition_html()` as `metadata_filename`. Falls back to `filename` when not
        provided.
    metadata_last_modified
        The last modified date for the document. Falls back to the last-modified date looked up
        for `filename` (when a filename was given).
    **kwargs
        Forwarded unchanged to `partition_html()`.

    Raises
    ------
    ValueError
        When the request for `url` is not successful, or when the response Content-Type is not
        `text/markdown`.
    """
    # -- a missing `text` is normalized to the empty string before the argument check;
    # -- presumably `exactly_one()` treats "" as "not provided" -- TODO confirm --
    if text is None:
        text = ""

    # -- verify that only one of the arguments was provided --
    exactly_one(filename=filename, file=file, text=text, url=url)

    last_modified = get_last_modified_date(filename) if filename else None

    if filename is not None:
        # -- opened in text mode, so `.read()` already produces `str`; no decoding needed --
        with open(filename, encoding="utf8") as f:
            text = f.read()

    elif file is not None:
        # -- `file` may yield bytes ("rb" mode) or str; normalize to str --
        text = optional_decode(file.read())

    elif url is not None:
        # -- NOTE(review): no `timeout` is passed here, so this call can block indefinitely on
        # -- an unresponsive server; consider exposing a timeout to callers --
        response = requests.get(url)
        if not response.ok:
            raise ValueError(f"URL return an error: {response.status_code}")

        content_type = response.headers.get("Content-Type", "")
        # -- media-type names are case-insensitive, so compare on a normalized copy while
        # -- reporting the header value exactly as it was received --
        if not content_type.strip().lower().startswith("text/markdown"):
            raise ValueError(
                f"Expected content type text/markdown. Got {content_type}.",
            )

        text = response.text

    html = markdown.markdown(text, extensions=["tables"])

    return partition_html(
        text=html,
        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.MD,
        metadata_last_modified=metadata_last_modified or last_modified,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )