"""Implementation of chunking by title. Main entry point is the `@add_chunking_strategy()` decorator. """ from __future__ import annotations from typing import Iterable, Iterator, Optional from unstructured.chunking.base import ( CHUNK_MULTI_PAGE_DEFAULT, BoundaryPredicate, ChunkingOptions, PreChunkCombiner, PreChunker, is_on_next_page, is_title, ) from unstructured.documents.elements import Element from unstructured.utils import lazyproperty def chunk_by_title( elements: Iterable[Element], *, combine_text_under_n_chars: Optional[int] = None, include_orig_elements: Optional[bool] = None, max_characters: Optional[int] = None, multipage_sections: Optional[bool] = None, new_after_n_chars: Optional[int] = None, overlap: Optional[int] = None, overlap_all: Optional[bool] = None, ) -> list[Element]: """Uses title elements to identify sections within the document for chunking. Splits off into a new CompositeElement when a title is detected or if metadata changes, which happens when page numbers or sections change. Cuts off sections once they have exceeded a character length of max_characters. Parameters ---------- elements A list of unstructured elements. Usually the output of a partition function. combine_text_under_n_chars Combines elements (for example a series of titles) until a section reaches a length of n characters. Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for this argument suppresses combining of small chunks. Note this value is "capped" at the `new_after_n_chars` value since a value higher than that would not change this parameter's effect. include_orig_elements When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field of the chunk(s) formed from that pre-chunk. Among other things, this allows access to original-element metadata that cannot be consolidated and is dropped in the course of chunking. max_characters Chunks elements text and text_as_html (if present) into chunks of length n characters (hard max) multipage_sections If True, sections can span multiple pages. Defaults to True. new_after_n_chars Cuts off new sections once they reach a length of n characters (soft max). Defaults to `max_characters` when not specified, which effectively disables any soft window. Specifying 0 for this argument causes each element to appear in a chunk by itself (although an element with text longer than `max_characters` will be still be split into two or more chunks). overlap Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. overlap_all Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of "pollution" of otherwise clean semantic chunk boundaries. """ opts = _ByTitleChunkingOptions.new( combine_text_under_n_chars=combine_text_under_n_chars, include_orig_elements=include_orig_elements, max_characters=max_characters, multipage_sections=multipage_sections, new_after_n_chars=new_after_n_chars, overlap=overlap, overlap_all=overlap_all, ) return _chunk_by_title(elements, opts) def _chunk_by_title(elements: Iterable[Element], opts: _ByTitleChunkingOptions) -> list[Element]: """Implementation of actual "by-title" chunking.""" # -- Note(scanny): it might seem like over-abstraction for this to be a separate function but # -- it eases overriding or adding individual chunking options when customizing a stock chunker. pre_chunks = PreChunkCombiner( PreChunker.iter_pre_chunks(elements, opts), opts=opts ).iter_combined_pre_chunks() return [chunk for pre_chunk in pre_chunks for chunk in pre_chunk.iter_chunks()] class _ByTitleChunkingOptions(ChunkingOptions): """Adds the by-title-specific chunking options to the base case. `by_title`-specific options: combine_text_under_n_chars A remedy to over-chunking caused by elements mis-identified as Title elements. Every Title element would start a new chunk and this setting mitigates that, at the expense of sometimes violating legitimate semantic boundaries. multipage_sections Indicates that page-boundaries should not be respected while chunking, i.e. elements appearing on two different pages can appear in the same chunk. """ @lazyproperty def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """The semantic-boundary detectors to be applied to break pre-chunks. For the `by_title` strategy these are sections indicated by a title (section-heading), an explicit section metadata item (only present for certain document types), and optionally page boundaries. """ def iter_boundary_predicates() -> Iterator[BoundaryPredicate]: yield is_title if not self.multipage_sections: yield is_on_next_page() return tuple(iter_boundary_predicates()) @lazyproperty def combine_text_under_n_chars(self) -> int: """Combine consecutive text pre-chunks if former is smaller than this and both will fit. - Does not combine text chunks if together they would exceed the chunking window. - Defaults to `max_characters` when not specified. - Is reduced to `new_after_n_chars` when it exceeds that value. """ # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified -- arg_value = self._kwargs.get("combine_text_under_n_chars") return self.hard_max if arg_value is None else arg_value @lazyproperty def multipage_sections(self) -> bool: """When False, break pre-chunks on page-boundaries.""" arg_value = self._kwargs.get("multipage_sections") return CHUNK_MULTI_PAGE_DEFAULT if arg_value is None else bool(arg_value) def _validate(self) -> None: """Raise ValueError if request option-set is invalid.""" # -- start with base-class validations -- super()._validate() # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination) # -- but a negative value is not if self.combine_text_under_n_chars < 0: raise ValueError( f"'combine_text_under_n_chars' argument must be >= 0," f" got {self.combine_text_under_n_chars}" ) # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to # -- users. The chunking behavior would be no different than when # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to # -- default (500) then it can look like chunk-combining isn't working. if self.combine_text_under_n_chars > self.hard_max: raise ValueError( f"'combine_text_under_n_chars' argument must not exceed `max_characters`" f" value, got {self.combine_text_under_n_chars} > {self.hard_max}" )