"""Domain-model for file-types."""
from __future__ import annotations
import enum
from typing import Iterable, cast
class FileType(enum.Enum):
    """The collection of file-types recognized by `unstructured`.

    Note not all of these can be partitioned, e.g. WAV and ZIP have no partitioner.

    Each member is declared as a 7-tuple that `__new__()` unpacks, in this order: the enum value,
    partitioner short-name, importable package dependencies, `pip` extra name, filename
    extensions, canonical MIME-type, and alias MIME-types.
    """

    _partitioner_shortname: str | None
    """Like "docx", from which partitioner module and function-name can be derived via template."""

    _importable_package_dependencies: tuple[str, ...]
    """Packages that must be available for import for this file-type's partitioner to work."""

    _extra_name: str | None
    """`pip install` extra that provides package dependencies for this file-type."""

    _extensions: tuple[str, ...]
    """Filename-extensions recognized as this file-type. Use for secondary identification only."""

    _canonical_mime_type: str
    """The MIME-type used as `.metadata.filetype` for this file-type."""

    _alias_mime_types: tuple[str, ...]
    """MIME-types accepted as identifying this file-type."""

    def __new__(
        cls,
        value: str,
        partitioner_shortname: str | None,
        importable_package_dependencies: Iterable[str],
        extra_name: str | None,
        extensions: Iterable[str],
        canonical_mime_type: str,
        alias_mime_types: Iterable[str],
    ):
        """Build a member from the tuple assigned to it in the class body.

        Only `value` becomes the member's enum value; the remaining items are stored as private
        attributes. The iterable arguments are frozen into tuples so a member does not share a
        mutable list with its declaration.
        """
        self = object.__new__(cls)
        self._value_ = value
        self._partitioner_shortname = partitioner_shortname
        self._importable_package_dependencies = tuple(importable_package_dependencies)
        self._extra_name = extra_name
        self._extensions = tuple(extensions)
        self._canonical_mime_type = canonical_mime_type
        self._alias_mime_types = tuple(alias_mime_types)
        return self

    def __lt__(self, other: FileType) -> bool:
        """Makes `FileType` members comparable with relational operators, at least with `<`.

        This makes them sortable, in particular it supports sorting for pandas groupby functions.
        Members are ordered by their `.name`.

        Returns `NotImplemented` when `other` is not a `FileType` so Python can try the reflected
        operation and otherwise raise its usual `TypeError`.
        """
        if not isinstance(other, FileType):
            return NotImplemented
        return self.name < other.name

    @classmethod
    def from_extension(cls, extension: str | None) -> FileType | None:
        """Select a FileType member based on an extension.

        `extension` must include the leading period, like `".pdf"`. Matching is case-insensitive,
        so `".PDF"` selects the same member as `".pdf"`. Extension is suitable as a secondary
        file-type identification method but is unreliable for primary identification.

        Returns `None` when `extension` is `None`, empty, a bare `"."`, or not registered for any
        supported file-type.
        """
        if not extension or extension == ".":
            return None

        # -- registered extensions are all lower-case, so normalize the candidate once --
        normalized = extension.lower()

        # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
        # -- limitations on defining a class variable on an Enum.
        for member in cls:
            if normalized in member._extensions:
                return member

        return None

    @classmethod
    def from_mime_type(cls, mime_type: str | None) -> FileType | None:
        """Select a FileType member based on a MIME-type.

        Matching ignores letter-case and surrounding whitespace because media type and subtype
        names are case-insensitive.

        Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
        `FileType` member or one of its alias MIME-types.
        """
        if mime_type is None:
            return None

        # -- registered MIME-types are all lower-case, so normalize the candidate once --
        normalized = mime_type.strip().lower()

        # -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
        # -- limitations on defining a class variable on an Enum.
        for member in cls:
            if normalized == member._canonical_mime_type or normalized in member._alias_mime_types:
                return member

        return None

    @property
    def extra_name(self) -> str | None:
        """The `pip` "extra" that must be installed to provide this file-type's dependencies.

        Like "image" for PNG, as in `pip install "unstructured[image]"`.

        `None` when partitioning this file-type requires only the base `unstructured` install.
        """
        return self._extra_name

    @property
    def importable_package_dependencies(self) -> tuple[str, ...]:
        """Packages that must be importable for this file-type's partitioner to work.

        In general, these are the packages provided by the `pip install` "extra" for this file-type,
        like `pip install "unstructured[docx]"` loads the `python-docx` package.

        Note that these names are the ones used in an `import` statement, which is not necessarily
        the same as the _distribution_ package name used by `pip`. For example, the DOCX
        distribution package name is `"python-docx"` whereas the _importable_ package name is
        `"docx"`. This latter name as it appears like `import docx` is what is provided by this
        property.

        The return value is an empty tuple for file-types that do not require optional dependencies.

        Note this property does not complain when accessed on a non-partitionable file-type, it
        simply returns an empty tuple because file-types that are not partitionable require no
        optional dependencies.
        """
        return self._importable_package_dependencies

    @property
    def is_partitionable(self) -> bool:
        """True when there is a partitioner for this file-type.

        Note this does not check whether the dependencies for this file-type are installed so
        attempting to partition a file of this type may still fail. This is meant for
        distinguishing file-types like WAV, ZIP, EMPTY, and UNK which are legitimate file-types
        but have no associated partitioner.
        """
        # -- same `None` test the `.partitioner_*` properties use to decide whether to raise --
        return self._partitioner_shortname is not None

    @property
    def mime_type(self) -> str:
        """The canonical MIME-type for this file-type, suitable for use in metadata.

        This value is used in `.metadata.filetype` for elements partitioned from files of this
        type. In general it is the "official", "recommended", or "defacto-standard" MIME-type for
        files of this type, in that order, as available.
        """
        return self._canonical_mime_type

    @property
    def partitioner_function_name(self) -> str:
        """Name of partitioner function for this file-type. Like "partition_docx".

        Raises `ValueError` when this property is accessed on a file-type that is not
        partitionable. Use `.is_partitionable` to avoid exceptions when partitionability is
        unknown.
        """
        shortname = self._require_partitioner_shortname("partitioner_function_name")
        return f"partition_{shortname}"

    @property
    def partitioner_module_qname(self) -> str:
        """Fully-qualified name of module providing partitioner for this file-type.

        e.g. "unstructured.partition.docx" for FileType.DOCX.

        Raises `ValueError` when this property is accessed on a file-type that is not
        partitionable.
        """
        shortname = self._require_partitioner_shortname("partitioner_module_qname")
        return f"unstructured.partition.{shortname}"

    @property
    def partitioner_shortname(self) -> str | None:
        """Familiar name of partitioner, like "image" for file-types that use `partition_image()`.

        One use is to determine whether a file-type is one of the five image types, all of which
        are processed by `partition_image()`.

        `None` for file-types that are not partitionable, although `.is_partitionable` is the
        preferred way of discovering that.
        """
        return self._partitioner_shortname

    def _require_partitioner_shortname(self, property_name: str) -> str:
        """Return the partitioner short-name, raising `ValueError` when there is none.

        `property_name` is the name of the public property being computed and appears in the
        error message.
        """
        # -- Raise when a partitioner-derived property is accessed on a FileType member that has
        # -- no partitioner shortname. This prevents a harder-to-find bug from appearing far away
        # -- from this call when code would try to `getattr(module, None)`,
        # -- `importlib.import_module(None)` or whatever.
        if (shortname := self._partitioner_shortname) is None:
            raise ValueError(
                f"`.{property_name}` is undefined because FileType.{self.name} is not"
                f" partitionable. Use `.is_partitionable` to determine whether a `FileType`"
                f" is partitionable."
            )
        return shortname

    BMP = (
        "bmp",  # -- value for this Enum member, like BMP = "bmp" in a simple enum --
        "image",  # -- partitioner_shortname --
        ["unstructured_inference"],  # -- importable_package_dependencies --
        "image",  # -- extra_name - like `pip install "unstructured[image]"` in this case --
        [".bmp"],  # -- extensions - filename extensions that map to this file-type --
        "image/bmp",  # -- canonical_mime_type - MIME-type written to `.metadata.filetype` --
        cast(list[str], []),  # -- alias_mime_types - other MIME-types that map to this file-type --
    )
    CSV = (
        "csv",
        "csv",
        ["pandas"],
        "csv",
        [".csv"],
        "text/csv",
        [
            "application/csv",
            "application/x-csv",
            "text/comma-separated-values",
            "text/x-comma-separated-values",
            "text/x-csv",
        ],
    )
    DOC = ("doc", "doc", ["docx"], "doc", [".doc"], "application/msword", cast(list[str], []))
    DOCX = (
        "docx",
        "docx",
        ["docx"],
        "docx",
        [".docx"],
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        cast(list[str], []),
    )
    EML = (
        "eml",
        "email",
        cast(list[str], []),
        None,
        [".eml", ".p7s"],
        "message/rfc822",
        cast(list[str], []),
    )
    EPUB = (
        "epub",
        "epub",
        ["pypandoc"],
        "epub",
        [".epub"],
        "application/epub",
        ["application/epub+zip"],
    )
    HEIC = (
        "heic",
        "image",
        ["unstructured_inference"],
        "image",
        [".heic"],
        "image/heic",
        cast(list[str], []),
    )
    HTML = (
        "html",
        "html",
        cast(list[str], []),
        None,
        [".html", ".htm"],
        "text/html",
        cast(list[str], []),
    )
    JPG = (
        "jpg",
        "image",
        ["unstructured_inference"],
        "image",
        [".jpeg", ".jpg"],
        "image/jpeg",
        cast(list[str], []),
    )
    JSON = (
        "json",
        "json",
        cast(list[str], []),
        None,
        [".json"],
        "application/json",
        cast(list[str], []),
    )
    MD = ("md", "md", ["markdown"], "md", [".md"], "text/markdown", ["text/x-markdown"])
    MSG = (
        "msg",
        "msg",
        ["oxmsg"],
        "msg",
        [".msg"],
        "application/vnd.ms-outlook",
        cast(list[str], []),
    )
    NDJSON = (
        "ndjson",
        "ndjson",
        ["ndjson"],
        None,
        [".ndjson"],
        "application/x-ndjson",
        cast(list[str], []),
    )
    ODT = (
        "odt",
        "odt",
        ["docx", "pypandoc"],
        "odt",
        [".odt"],
        "application/vnd.oasis.opendocument.text",
        cast(list[str], []),
    )
    ORG = ("org", "org", ["pypandoc"], "org", [".org"], "text/org", cast(list[str], []))
    PDF = (
        "pdf",
        "pdf",
        ["pdf2image", "pdfminer", "PIL"],
        "pdf",
        [".pdf"],
        "application/pdf",
        cast(list[str], []),
    )
    PNG = (
        "png",
        "image",
        ["unstructured_inference"],
        "image",
        [".png"],
        "image/png",
        cast(list[str], []),
    )
    PPT = (
        "ppt",
        "ppt",
        ["pptx"],
        "ppt",
        [".ppt"],
        "application/vnd.ms-powerpoint",
        cast(list[str], []),
    )
    PPTX = (
        "pptx",
        "pptx",
        ["pptx"],
        "pptx",
        [".pptx"],
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        cast(list[str], []),
    )
    RST = ("rst", "rst", ["pypandoc"], "rst", [".rst"], "text/x-rst", cast(list[str], []))
    RTF = ("rtf", "rtf", ["pypandoc"], "rtf", [".rtf"], "text/rtf", ["application/rtf"])
    TIFF = (
        "tiff",
        "image",
        ["unstructured_inference"],
        "image",
        [".tiff"],
        "image/tiff",
        cast(list[str], []),
    )
    TSV = ("tsv", "tsv", ["pandas"], "tsv", [".tab", ".tsv"], "text/tsv", cast(list[str], []))
    TXT = (
        "txt",
        "text",
        cast(list[str], []),
        None,
        [
            ".txt",
            ".text",
            # NOTE(robinson) - for now we are treating code files as plain text
            ".c",
            ".cc",
            ".cpp",
            ".cs",
            ".cxx",
            ".go",
            ".java",
            ".js",
            ".log",
            ".php",
            ".py",
            ".rb",
            ".swift",
            ".ts",
            ".yaml",
            ".yml",
        ],
        "text/plain",
        [
            # NOTE(robinson) - In the future, we may have special processing for YAML files
            # instead of treating them as plaintext.
            "text/yaml",
            "application/x-yaml",
            "application/yaml",
            "text/x-yaml",
        ],
    )
    WAV = (
        "wav",
        None,
        cast(list[str], []),
        None,
        [".wav"],
        "audio/wav",
        [
            "audio/vnd.wav",
            "audio/vnd.wave",
            "audio/wave",
            "audio/x-pn-wav",
            "audio/x-wav",
        ],
    )
    XLS = (
        "xls",
        "xlsx",
        ["pandas", "openpyxl"],
        "xlsx",
        [".xls"],
        "application/vnd.ms-excel",
        cast(list[str], []),
    )
    XLSX = (
        "xlsx",
        "xlsx",
        ["pandas", "openpyxl"],
        "xlsx",
        [".xlsx"],
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        cast(list[str], []),
    )
    XML = ("xml", "xml", cast(list[str], []), None, [".xml"], "application/xml", ["text/xml"])
    ZIP = ("zip", None, cast(list[str], []), None, [".zip"], "application/zip", cast(list[str], []))
    UNK = (
        "unk",
        None,
        cast(list[str], []),
        None,
        cast(list[str], []),
        "application/octet-stream",
        cast(list[str], []),
    )
    EMPTY = (
        "empty",
        None,
        cast(list[str], []),
        None,
        cast(list[str], []),
        "inode/x-empty",
        cast(list[str], []),
    )