"""Provides operations related to the HTML table stored in `.metadata.text_as_html`.
Used during partitioning as well as chunking.
"""
from __future__ import annotations
import html
from typing import TYPE_CHECKING, Iterator, Sequence, cast
from lxml import etree
from lxml.html import fragment_fromstring
from unstructured.utils import lazyproperty
if TYPE_CHECKING:
from lxml.html import HtmlElement
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
    """Render `matrix`, a sequence of rows of cell strings, as a compact HTML `<table>`.

    An empty `matrix` produces the empty string. Otherwise the result is a single-line
    `<table>` fragment built to carry as little character overhead as possible:

    - No indentation or other padding whitespace is inserted between elements.
    - No newline characters are added.
    - No `<thead>`, `<tbody>`, or `<tfoot>` wrappers are emitted; there is no way to know
      where they would be semantically appropriate, so they would waste space at best and
      mislead at worst.

    Rows that contain no cells are omitted from the output.
    """

    def td_html(cell_str: str) -> str:
        """Return the `<td>` markup for a single cell string."""
        # -- escape markup-significant characters like '<' and '>' first so the `<br/>`
        # -- elements inserted next are not themselves escaped --
        escaped = html.escape(cell_str)
        # -- each line-feed in the text becomes a `<br/>` element --
        with_breaks = escaped.replace("\n", "<br/>")
        # -- collapse remaining whitespace runs and trim both ends --
        normalized = " ".join(with_breaks.split())
        # -- a cell with no remaining text is emitted as a void `<td/>` --
        if not normalized:
            return "<td/>"
        return f"<td>{normalized}</td>"

    def tr_html(cell_strs: Sequence[str]) -> str:
        """Return the `<tr>` markup for one row of cell strings."""
        cells_html = "".join(td_html(cell_str) for cell_str in cell_strs)
        return f"<tr>{cells_html}</tr>"

    if not matrix:
        return ""

    # -- rows having no cells are skipped rather than emitted as an empty `<tr>` --
    rows_html = "".join(tr_html(cell_strs) for cell_strs in matrix if cell_strs)
    return f"<table>{rows_html}</table>"
class HtmlTable:
    """A `<table>` element.

    Wraps an lxml `<table>` element and exposes its serialized HTML, its rows, and its text.
    Use `.from_html_text()` to get an instance whose markup has been normalized and compacted.
    """

    def __init__(self, table: HtmlElement):
        self._table = table

    @classmethod
    def from_html_text(cls, html_text: str) -> HtmlTable:
        """Construct an instance from `html_text`, an HTML fragment containing a `<table>`.

        The first `<table>` element found is used. It is normalized in place: `<thead>`,
        `<tbody>` and `<tfoot>` wrappers are removed, all attributes are stripped, `<th>`
        becomes `<td>`, whitespace in element text is collapsed, and all tail text is dropped.

        Raises `ValueError` when `html_text` contains no `<table>` element.
        """
        fragment = fragment_fromstring(html_text)

        # -- so far the fragment root has always been the `<table>` itself, but search for it
        # -- rather than assume that --
        found_tables = fragment.xpath("//table")
        if not found_tables:
            raise ValueError("`html_text` contains no `<table>` element")
        table_elm = found_tables[0]

        # -- `<thead>`, `<tbody>`, and `<tfoot>` are noise here; drop each such tag when present --
        for row_group in table_elm.xpath(".//thead | .//tbody | .//tfoot"):
            row_group.drop_tag()

        # -- normalize and compactify every element, including the `<table>` itself --
        for elm in table_elm.iter():
            # -- discard every tail; it is newline + indent if anything --
            if elm.tail:
                elm.tail = None

            # -- collapse whitespace in the element text; this removes indent whitespace ahead
            # -- of nested elements and leaves a single space between words --
            if elm.text:
                elm.text = " ".join(elm.text.split())

            # -- every cell gets the same tag, so `<th>` becomes `<td>` --
            if elm.tag == "th":
                elm.tag = "td"

            # -- drop all attributes, e.g. the border="1" and class="dataframe" added by
            # -- pandas.DataFrame.to_html(), style="text-align: right;", etc. --
            elm.attrib.clear()

        return cls(table_elm)

    @lazyproperty
    def html(self) -> str:
        """This `<table>` element serialized as a single-line HTML fragment.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        For a table built by `.from_html_text()` the markup carries no readability whitespace,
        no attributes, and no `<thead>`, `<tbody>`, or `<tfoot>` tags. Keeping it as compact as
        possible maximizes the semantic content that fits in a given space, which matters most
        for chunking.
        """
        return etree.tostring(self._table, encoding=str)

    def iter_rows(self) -> Iterator[HtmlRow]:
        """Generate an `HtmlRow` for each `<tr>` that is a direct child of this table."""
        tr_elms = cast("list[HtmlElement]", self._table.xpath("./tr"))
        for tr in tr_elms:
            yield HtmlRow(tr)

    @lazyproperty
    def text(self) -> str:
        """The clean, concatenated text of this table, words separated by single spaces."""
        # -- joining on a space leaves extra whitespace wherever a cell is blank, so the
        # -- accumulated text is split and re-joined to normalize it --
        words = " ".join(self._table.itertext()).split()
        return " ".join(words)
class HtmlRow:
    """A `<tr>` element.

    Wraps the lxml element for one table row and provides its serialized HTML, its cells, and
    the text those cells contain.
    """

    def __init__(self, tr: HtmlElement):
        self._tr = tr

    @lazyproperty
    def html(self) -> str:
        """Like "<tr><td>foo</td><td>bar</td></tr>"."""
        return etree.tostring(self._tr, encoding=str)

    def iter_cells(self) -> Iterator[HtmlCell]:
        """Generate an `HtmlCell` for each child element of this row, in document order."""
        for td in self._tr:
            yield HtmlCell(td)

    def iter_cell_texts(self) -> Iterator[str]:
        """Generate contents of each cell of this row as a separate string.

        Leading and trailing whitespace is removed from each string. A cell that is empty or
        contains only whitespace does not generate a string.
        """
        for td in self._tr:
            if (text := td.text) is None:
                continue
            # -- strip before testing for emptiness; otherwise a whitespace-only cell like
            # -- "   " is truthy, gets emitted, and inflates `.text_len` --
            text = text.strip()
            if not text:
                continue
            yield text

    @lazyproperty
    def text_len(self) -> int:
        """Length of the normalized text, as it would appear in `element.text`.

        Computed as the length of the non-empty cell texts joined by a single space.
        """
        return len(" ".join(self.iter_cell_texts()))
class HtmlCell:
    """A `<td>` element.

    Wraps the lxml element for a single table cell and provides its HTML and its text.
    """

    def __init__(self, td: HtmlElement):
        self._td = td

    @lazyproperty
    def html(self) -> str:
        """Like "<td>foo bar baz</td>", or the void "<td/>" when the cell has no text."""
        if not self.text:
            return "<td/>"
        return etree.tostring(self._td, encoding=str)

    @lazyproperty
    def text(self) -> str:
        """Whitespace-normalized text inside the `<td>` element, empty string when no text.

        Only the element's own `.text` is read. Whitespace runs are collapsed to a single space
        and leading and trailing whitespace is removed.
        """
        raw_text = self._td.text
        if raw_text is None:
            return ""
        # -- `.split()` with no arguments already ignores leading and trailing whitespace --
        return " ".join(raw_text.split())