"""Handles dispatch of elements to a chunking-strategy by name.
Also provides the `@add_chunking_strategy` decorator which is the chief current user of "by-name"
chunking dispatch.
"""
from __future__ import annotations
import dataclasses as dc
import functools
import inspect
from typing import Any, Callable, Iterable, Optional, Protocol
from typing_extensions import ParamSpec
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
# -- Parameter specification of the decorated partitioner; lets `add_chunking_strategy()` declare
# -- that the wrapper it returns accepts the same arguments as the function it wraps.
_P = ParamSpec("_P")
class Chunker(Protocol):
    """Structural type describing a callable usable as a chunking function."""

    def __call__(
        self,
        elements: Iterable[Element],
        *,
        max_characters: Optional[int],
    ) -> list[Element]:
        """Signature every chunking function is expected to be compatible with.

        The minimum requirement is an `elements` parameter. Every chunker also takes a
        `max_characters` parameter, although it need not come immediately after `elements`.
        Any further parameters are chunker-specific and may differ from one chunker to the next.
        """
        ...
def add_chunking_strategy(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
    """Decorator that optionally chunks the elements produced by a partitioner.

    The returned wrapper first calls the decorated partitioner. When the call carries a
    `chunking_strategy` argument that is not `None`, the resulting elements are passed to
    `chunk()` together with the rest of the call arguments and the chunked elements are returned
    instead. Otherwise the partitioner's elements are returned unchanged.

    As a side effect at decoration time, documentation for `chunking_strategy` and the related
    chunking arguments is appended to `func.__doc__` when `func` has a docstring, declares a
    `chunking_strategy` parameter, and its docstring does not already mention it.

    Args:
        func: The partitioning function to decorate.

    Returns:
        A wrapper with the same call signature as `func`.

    Raises:
        ValueError: Raised by `chunk()` when the wrapper is called with a `chunking_strategy`
            that is not registered. Note the partitioner has already run by then.
    """
    # -- Patch the docstring of the decorated function to add chunking strategy and
    # -- chunking-related argument documentation. This only applies when `chunking_strategy`
    # -- is an explicit argument of the decorated function and "chunking_strategy" is not
    # -- already mentioned in the docstring.
    docstring = func.__doc__
    if docstring and "chunking_strategy" not in docstring:
        # -- Inspect the signature rather than `func.__code__.co_varnames`: `co_varnames` also
        # -- lists plain local variables (false positives), is absent on callables that are not
        # -- Python functions, and describes the outer wrapper rather than the wrapped function
        # -- when `func` has already been decorated using `functools.wraps`.
        try:
            declares_chunking_strategy = "chunking_strategy" in inspect.signature(func).parameters
        except (TypeError, ValueError):
            # -- no introspectable signature; leave the docstring alone --
            declares_chunking_strategy = False

        if declares_chunking_strategy:
            func.__doc__ = docstring + (
                "\nchunking_strategy"
                + "\n\tStrategy used for chunking text into larger or smaller elements."
                + "\n\tDefaults to `None` with optional arg of 'basic' or 'by_title'."
                + "\n\tAdditional Parameters:"
                + "\n\t\tmultipage_sections"
                + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
                + "\n\t\tcombine_text_under_n_chars"
                + "\n\t\t\tCombines elements (for example a series of titles) until a section"
                + "\n\t\t\treaches a length of n characters. Only applies to 'by_title' strategy."
                + "\n\t\tnew_after_n_chars"
                + "\n\t\t\tCuts off chunks once they reach a length of n characters; a soft max."
                + "\n\t\tmax_characters"
                + "\n\t\t\tChunks elements text and text_as_html (if present) into chunks"
                + "\n\t\t\tof length n characters, a hard max."
            )

    # -- `functools.wraps` is applied after the patch above so the wrapper copies the updated
    # -- docstring --
    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
        """The decorated function is replaced with this one."""
        # -- call the partitioning function to get the elements --
        elements = func(*args, **kwargs)

        # -- look for a chunking-strategy argument --
        call_args = get_call_args_applying_defaults(func, *args, **kwargs)
        chunking_strategy = call_args.pop("chunking_strategy", None)

        # -- no chunking-strategy means no chunking --
        if chunking_strategy is None:
            return elements

        # -- otherwise, chunk away :) `chunk()` ignores any call-args the chunker doesn't take --
        return chunk(elements, chunking_strategy, **call_args)

    return wrapper
def chunk(elements: Iterable[Element], chunking_strategy: str, **kwargs: Any) -> list[Element]:
    """Chunk `elements` using the chunker registered under the name `chunking_strategy`.

    Args:
        elements: The elements to be chunked.
        chunking_strategy: Registry key identifying the chunker to use.
        **kwargs: Candidate keyword arguments for the chunker. Only those whose name appears in
            the chunker's `kw_arg_names` are forwarded; all others are silently dropped.

    Returns:
        Whatever the selected chunker returns for `elements`.

    Raises:
        ValueError: If no chunker is registered under `chunking_strategy`.
    """
    spec = _chunker_registry.get(chunking_strategy)
    if spec is None:
        raise ValueError(f"unrecognized chunking strategy {chunking_strategy!r}")

    # -- callers typically hand over every keyword argument the partitioner received, so keep
    # -- only the ones this particular chunker can accept --
    accepted_kwargs: dict[str, Any] = {}
    for arg_name, arg_value in kwargs.items():
        if arg_name in spec.kw_arg_names:
            accepted_kwargs[arg_name] = arg_value

    return spec.chunker(elements, **accepted_kwargs)
def register_chunking_strategy(name: str, chunker: Chunker) -> None:
    """Register `chunker` so it is selected when `name` is given as the `chunking_strategy`.

    An entry already stored under `name` is replaced.
    """
    spec = _ChunkerSpec(chunker)
    _chunker_registry[name] = spec
@dc.dataclass(frozen=True)
class _ChunkerSpec:
    """Immutable registry record describing a single chunking function."""

    chunker: Chunker
    """The "chunk_by_{x}() function that implements this chunking strategy."""

    @lazyproperty
    def kw_arg_names(self) -> tuple[str, ...]:
        """Names of the parameters this chunker accepts besides `elements`.

        Taken from the chunker's signature, in declaration order, with the required `elements`
        parameter left out.
        """
        declared_params = inspect.signature(self.chunker).parameters
        names: list[str] = []
        for param_name in declared_params:
            if param_name == "elements":
                continue
            names.append(param_name)
        return tuple(names)
# -- Chunkers available out of the box, keyed by the `chunking_strategy` value that selects each.
# -- `register_chunking_strategy()` adds further entries to this mapping at runtime.
_chunker_registry: dict[str, _ChunkerSpec] = dict(
    basic=_ChunkerSpec(chunk_elements),
    by_title=_ChunkerSpec(chunk_by_title),
)