"""Implementation of baseline chunking.

This is the "plain-vanilla" chunking strategy. All the fundamental chunking behaviors are present
in this strategy and also in all other strategies. Those are:

- Maximally fill each chunk with sequential elements.
- Isolate oversized elements and divide (only) those chunks by text-splitting.
- Overlap when requested.

"Fancier" strategies add higher-level semantic-unit boundaries to be respected. For example, in
the by-title strategy, section boundaries are respected, meaning a chunk never contains text from
two different sections. When a new section is detected the current chunk is closed and a new one
started.
"""

from __future__ import annotations

from typing import Iterable, Optional

from unstructured.chunking.base import ChunkingOptions, PreChunker
from unstructured.documents.elements import Element


def chunk_elements(
    elements: Iterable[Element],
    *,
    include_orig_elements: Optional[bool] = None,
    max_characters: Optional[int] = None,
    new_after_n_chars: Optional[int] = None,
    overlap: Optional[int] = None,
    overlap_all: Optional[bool] = None,
) -> list[Element]:
    """Combine sequential `elements` into chunks, respecting specified text-length limits.

    Produces a sequence of `CompositeElement`, `Table`, and `TableChunk` elements (chunks).

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    include_orig_elements
        When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field
        of the chunk(s) formed from that pre-chunk. Among other things, this allows access to
        original-element metadata that cannot be consolidated and is dropped in the course of
        chunking.
    max_characters
        Hard maximum chunk length. No chunk will exceed this length. A single element that exceeds
        this length will be divided into two or more chunks using text-splitting.
    new_after_n_chars
        A chunk of this length or greater is not extended to include the next element, even if
        that element would fit without exceeding `max_characters`. A "soft max" length that can be
        used in conjunction with `max_characters` to limit most chunks to a preferred length while
        still allowing larger elements to be included in a single chunk without resorting to
        text-splitting. Defaults to `max_characters` when not specified, which effectively
        disables any soft window. Specifying 0 for this argument causes each element to appear in
        a chunk by itself (although an element with text longer than `max_characters` will still
        be split into two or more chunks).
    overlap
        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
        next chunk as a context-preserving mechanism. By default, this only applies to
        split-chunks where an oversized element is divided into multiple chunks by text-splitting.
    overlap_all
        Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole
        elements and not subject to text-splitting. Use this with caution as it produces a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    """
    # -- building the options object raises ValueError on invalid parameters --
    chunking_opts = _BasicChunkingOptions.new(
        include_orig_elements=include_orig_elements,
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        overlap=overlap,
        overlap_all=overlap_all,
    )
    return _chunk_elements(elements, chunking_opts)


def _chunk_elements(elements: Iterable[Element], opts: _BasicChunkingOptions) -> list[Element]:
    """Implementation of actual basic chunking.

    Groups `elements` into pre-chunks according to `opts`, then collects the chunks produced by
    each pre-chunk, in order, into a single list.
    """
    # -- Note(scanny): it might seem like over-abstraction for this to be a separate function but
    # -- it eases overriding or adding individual chunking options when customizing a stock
    # -- chunker.
    chunks: list[Element] = []
    for pre_chunk in PreChunker.iter_pre_chunks(elements, opts):
        # -- a single pre-chunk can yield more than one chunk --
        chunks.extend(pre_chunk.iter_chunks())
    return chunks


class _BasicChunkingOptions(ChunkingOptions):
    """Options for `basic` chunking."""
Memory