from __future__ import annotations

import contextlib
import csv
from typing import IO, Any, Iterator

import pandas as pd

from unstructured.chunking import add_chunking_strategy
from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.utils import is_temp_file_path, lazyproperty

DETECTION_ORIGIN: str = "csv"


@apply_metadata(FileType.CSV)
@add_chunking_strategy
def partition_csv(
    filename: str | None = None,
    *,
    file: IO[bytes] | None = None,
    encoding: str | None = None,
    include_header: bool = False,
    infer_table_structure: bool = True,
    **kwargs: Any,
) -> list[Element]:
    """Partitions a CSV document into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    include_header
        Determines whether or not header info is included in text and metadata.text_as_html.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field named
        "text_as_html" where the table's text content is rendered into an html string. I.e.,
        rows and cells are preserved. Whether True or False, the "text" field is always present
        in any Table element and is the text content of the table (no structure).
    """
    ctx = _CsvPartitioningContext.load(
        file_path=filename,
        file=file,
        encoding=encoding,
        include_header=include_header,
        infer_table_structure=infer_table_structure,
    )

    with ctx.open() as file:
        dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)

    html_table = HtmlTable.from_html_text(
        dataframe.to_html(index=False, header=include_header, na_rep="")
    )

    metadata = ElementMetadata(
        filename=filename,
        last_modified=ctx.last_modified,
        text_as_html=html_table.html if infer_table_structure else None,
    )

    # -- a CSV file becomes a single `Table` element --
    return [Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]


class _CsvPartitioningContext:
    """Encapsulates the partitioning-run details.

    Provides access to argument values and especially encapsulates computation of values derived
    from those values so they don't obscure the core partitioning logic.
    """

    def __init__(
        self,
        file_path: str | None = None,
        file: IO[bytes] | None = None,
        encoding: str | None = None,
        include_header: bool = False,
        infer_table_structure: bool = True,
    ):
        self._file_path = file_path
        self._file = file
        self._encoding = encoding
        self._include_header = include_header
        self._infer_table_structure = infer_table_structure

    @classmethod
    def load(
        cls,
        file_path: str | None,
        file: IO[bytes] | None,
        encoding: str | None,
        include_header: bool,
        infer_table_structure: bool,
    ) -> _CsvPartitioningContext:
        return cls(
            file_path=file_path,
            file=file,
            encoding=encoding,
            include_header=include_header,
            infer_table_structure=infer_table_structure,
        )._validate()

    @lazyproperty
    def delimiter(self) -> str | None:
        """The CSV delimiter, nominally a comma ",".

        `None` for a single-column CSV file which naturally has no delimiter.
        """
        sniffer = csv.Sniffer()
        num_bytes = 65536

        with self.open() as file:
            # -- read whole lines, sniffer can be confused by a trailing partial line --
            data = "\n".join(
                ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
            )

        try:
            return sniffer.sniff(data, delimiters=",;").delimiter
        except csv.Error:
            # -- sniffing will fail on single-column csv as no default can be assumed --
            return None

    @lazyproperty
    def header(self) -> int | None:
        """Identifies the header row, if any, to Pandas, by idx."""
        return 0 if self._include_header else None

    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
        return (
            None
            if not self._file_path or is_temp_file_path(self._file_path)
            else get_last_modified_date(self._file_path)
        )

    @contextlib.contextmanager
    def open(self) -> Iterator[IO[bytes]]:
        """Encapsulates complexity of dealing with file-path or file-like-object.

        Provides an `IO[bytes]` object as the "common-denominator" document source.

        Must be used as a context manager using a `with` statement:

            with self.open() as file:
                do things with file

        File is guaranteed to be at read position 0 when called.
        """
        if self._file_path:
            with open(self._file_path, "rb") as f:
                yield f
        else:
            file = self._file
            assert file is not None  # -- guaranteed by `._validate()` --
            # -- Be polite on principle. Reset file-pointer both before and after use --
            file.seek(0)
            yield file
            file.seek(0)

    def _validate(self) -> _CsvPartitioningContext:
        """Raise on invalid argument values."""
        if self._file_path is None and self._file is None:
            raise ValueError("either file-path or file-like object must be provided")
        return self
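# -- Usage sketch (illustrative only; "example.csv" is a hypothetical path, not part of this
# -- module):
#
#     from unstructured.partition.csv import partition_csv
#
#     elements = partition_csv(filename="example.csv", include_header=True)
#     table = elements[0]  # -- a CSV file always yields a single `Table` element --
#     print(table.text)  # -- flattened text content of the table --
#     print(table.metadata.text_as_html)  # -- populated when infer_table_structure=True --
#
#     # -- or partition from a binary file-like object --
#     with open("example.csv", "rb") as f:
#         elements = partition_csv(file=f)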