from __future__ import annotations
import copy
import re
from typing import IO, Any, Callable, Literal
from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import (
auto_paragraph_grouper,
clean_bullets,
)
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
EmailAddress,
Footer,
Header,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_numbered_list,
is_possible_title,
is_us_city_state_zip,
)
@apply_metadata(FileType.TXT)
@add_chunking_strategy
def partition_text(
    filename: str | None = None,
    *,
    file: IO[bytes] | None = None,
    encoding: str | None = None,
    text: str | None = None,
    paragraph_grouper: Callable[[str], str] | Literal[False] | None = None,
    detection_origin: str | None = "text",
    **kwargs: Any,
) -> list[Element]:
    """Partition a plain-text document into its constituent paragraph elements.

    The input is read from exactly one of `filename`, `file` or `text`, optionally passed through
    a paragraph grouper, split into paragraphs, and each non-empty paragraph is classified into an
    element by `element_from_text()`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    encoding
        The encoding passed to `read_txt_file()` when the input is drawn from `filename` or
        `file`. Not used when `text` is provided.
    text
        The string representation of the .txt document. A whitespace-only string (with no
        `filename` or `file`) produces an empty list.
    paragraph_grouper
        A str -> str function for fixing paragraphs that are interrupted by line breaks
        for formatting purposes. `None` selects `auto_paragraph_grouper`; `False` disables
        grouping altogether.
    detection_origin
        Value recorded as `detection_origin` on the metadata of every element produced.
    **kwargs
        Not read by this function body; presumably consumed by the decorators -- confirm there.

    Returns
    -------
    list[Element]
        One element per non-empty paragraph that is not a lone bullet character, in document
        order.
    """
    # -- whitespace-only `text` and no other source: nothing to partition --
    if text is not None and text.strip() == "" and not (file or filename):
        return []

    # -- Verify that only one of the arguments was provided --
    exactly_one(filename=filename, file=file, text=text)

    # -- load the raw document text from whichever source was given --
    if filename is not None:
        _, raw_text = read_txt_file(filename=filename, encoding=encoding)
    elif file is not None:
        _, raw_text = read_txt_file(file=file, encoding=encoding)
    elif text is not None:
        raw_text = str(text)
    else:
        raw_text = ""

    # -- `None` means "use the automatic grouper"; `False` means "leave the text untouched" --
    if paragraph_grouper is None:
        raw_text = auto_paragraph_grouper(raw_text)
    elif paragraph_grouper is not False:
        raw_text = paragraph_grouper(raw_text)

    paragraphs = _split_by_paragraph(raw_text)

    # -- metadata template; each element receives its own deep copy below --
    base_metadata = ElementMetadata(
        last_modified=get_last_modified_date(filename) if filename else None,
    )
    base_metadata.detection_origin = detection_origin

    elements: list[Element] = []
    for paragraph in paragraphs:
        stripped = paragraph.strip()
        # -- skip blank paragraphs and paragraphs consisting of a single bullet character --
        if not stripped or _is_empty_bullet(stripped):
            continue
        element = element_from_text(stripped)
        element.metadata = copy.deepcopy(base_metadata)
        elements.append(element)

    return elements
def element_from_text(
    text: str,
    coordinates: tuple[tuple[float, float], ...] | None = None,
    coordinate_system: CoordinateSystem | None = None,
) -> Element:
    """Build the element that best matches `text` (and, when given, its page position).

    Checks are applied in a fixed order and the first match wins: header position, footer
    position, bulleted text, e-mail address, US city/state/zip, numbered list, narrative text,
    title. Text matching none of these becomes a generic `Text` element.

    Parameters
    ----------
    text
        The text content of the element.
    coordinates
        Optional points locating the text; used for the header/footer position checks and passed
        on to the created element.
    coordinate_system
        Optional coordinate system the `coordinates` are expressed in.

    Returns
    -------
    Element
        A `Header`, `Footer`, `ListItem`, `EmailAddress`, `Address`, `NarrativeText`, `Title` or
        `Text` instance.
    """
    # -- positional keyword arguments shared by almost every element constructor below --
    placement = {"coordinates": coordinates, "coordinate_system": coordinate_system}

    # -- position-based classification takes precedence over content-based classification --
    if _is_in_header_position(coordinates, coordinate_system):
        return Header(text=text, **placement)

    if _is_in_footer_position(coordinates, coordinate_system):
        return Footer(text=text, **placement)

    if is_bulleted_text(text):
        # -- the bullet is removed from the text stored on the list item --
        return ListItem(text=clean_bullets(text), **placement)

    if is_email_address(text):
        # -- NOTE: unlike the other branches, coordinates are not forwarded here --
        return EmailAddress(text=text)

    if is_us_city_state_zip(text):
        return Address(text=text, **placement)

    if is_possible_numbered_list(text):
        return ListItem(text=text, **placement)

    if is_possible_narrative_text(text):
        return NarrativeText(text=text, **placement)

    if is_possible_title(text):
        return Title(text=text, **placement)

    # -- fallback when no more specific element type applies --
    return Text(text=text, **placement)
# ================================================================================================
# HELPER FUNCTIONS
# ================================================================================================
def _get_height_percentage(
    coordinates: tuple[tuple[float, float], ...],
    coordinate_system: CoordinateSystem,
) -> float:
    """Return the mean y-value of `coordinates` as a fraction of the coordinate-system height.

    The y-value is taken from index 1 of each point. Raises `ZeroDivisionError` when
    `coordinates` is empty or the coordinate-system height is zero.
    """
    y_values = [point[1] for point in coordinates]
    mean_y = sum(y_values) / len(y_values)
    return mean_y / coordinate_system.height
def _is_empty_bullet(text: str) -> bool:
    """Return True when `text` is a single character that matches `UNICODE_BULLETS_RE`."""
    if len(text) != 1:
        return False
    return UNICODE_BULLETS_RE.match(text) is not None
def _is_in_footer_position(
    coordinates: tuple[tuple[float, float], ...] | None,
    coordinate_system: CoordinateSystem | None,
    threshold: float = 0.93,
) -> bool:
    """Report whether the text's position indicates it belongs to a footer.

    The position is considered a footer position when the height fraction computed by
    `_get_height_percentage()` is strictly greater than `threshold`. Without both `coordinates`
    and `coordinate_system` no position can be determined and the result is False.
    """
    if coordinates is not None and coordinate_system is not None:
        return _get_height_percentage(coordinates, coordinate_system) > threshold
    return False
def _is_in_header_position(
    coordinates: tuple[tuple[float, float], ...] | None,
    coordinate_system: CoordinateSystem | None,
    threshold: float = 0.07,
) -> bool:
    """Report whether the text's position indicates it belongs to a header.

    The position is considered a header position when the height fraction computed by
    `_get_height_percentage()` is strictly less than `threshold`. Without both `coordinates`
    and `coordinate_system` no position can be determined and the result is False.
    """
    if coordinates is not None and coordinate_system is not None:
        return _get_height_percentage(coordinates, coordinate_system) < threshold
    return False
def _split_by_paragraph(file_text: str) -> list[str]:
    """Split `file_text` into paragraphs on `PARAGRAPH_PATTERN`.

    Leading and trailing whitespace is stripped from the text as a whole before splitting.
    """
    trimmed_text = file_text.strip()
    return re.split(PARAGRAPH_PATTERN, trimmed_text)