from __future__ import annotations

from typing import IO, Any, Optional

import pandas as pd

from unstructured.chunking import add_chunking_strategy
from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import (
    exactly_one,
    spooled_to_bytes_io_if_needed,
)
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date

DETECTION_ORIGIN: str = "tsv"


@apply_metadata(FileType.TSV)
@add_chunking_strategy
def partition_tsv(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    include_header: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partition a TSV document into a single `Table` element.

    The tab-separated content is loaded with pandas, rendered to an HTML table, and wrapped
    in one `Table` element whose text and `metadata.text_as_html` both derive from that
    HTML rendering.

    Parameters
    ----------
    filename
        Path of the TSV file to read. Pass either this or `file`; the pair is checked by
        `exactly_one()`.
    file
        A file-like object opened in binary mode, e.g. `open(filename, "rb")`.
    include_header
        When True, the first row is read as the column header and is rendered in the HTML
        table. When False, every row is read as data and no header row is rendered.
    **kwargs
        Not read by this function body. NOTE(review): presumably consumed by the
        `apply_metadata` / `add_chunking_strategy` decorators -- confirm.

    Returns
    -------
    list[Element]
        A one-item list holding the `Table` element.
    """
    exactly_one(filename=filename, file=file)

    # -- pandas wants the header's row index, or None when there is no header row --
    header_row = 0 if include_header else None

    if filename:
        tsv_source = filename
    else:
        assert file is not None
        # -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement
        # -- `.readable()`, which triggers an exception in the `pd.read_csv()` call, so
        # -- the file is passed through `spooled_to_bytes_io_if_needed()` first.
        tsv_source = spooled_to_bytes_io_if_needed(file)

    table_df = pd.read_csv(tsv_source, sep="\t", header=header_row)

    # -- missing cells are rendered as empty strings; the row index is left out --
    rendered_html = table_df.to_html(index=False, header=include_header, na_rep="")
    html_table = HtmlTable.from_html_text(rendered_html)

    # -- a last-modified date is only looked up when reading from a path --
    last_modified = get_last_modified_date(filename) if filename else None

    metadata = ElementMetadata(
        filename=filename,
        last_modified=last_modified,
        text_as_html=html_table.html,
    )
    metadata.detection_origin = DETECTION_ORIGIN

    table_element = Table(text=html_table.text, metadata=metadata)
    return [table_element]