from __future__ import annotations
from typing import Any, Dict, List
from bs4 import BeautifulSoup
from unstructured_inference.models.tables import cells_to_html
# Placeholder cell: every field of the converted-cell layout set to an empty string.
EMPTY_CELL = dict.fromkeys(("row_index", "col_index", "content"), "")
def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]):
"""Move cells to the right if spanned cells have an influence on the rendering.
Args:
cells: List of cells in the table in Deckerd format.
Returns:
List of cells in the table in Deckerd format with cells moved to the right if spanned.
"""
sorted_cells = sorted(cells, key=lambda x: (x["y"], x["x"]))
cells_occupied_by_spanned = set()
for cell in sorted_cells:
if cell["w"] > 1 or cell["h"] > 1:
for i in range(cell["y"], cell["y"] + cell["h"]):
for j in range(cell["x"], cell["x"] + cell["w"]):
if (i, j) != (cell["y"], cell["x"]):
cells_occupied_by_spanned.add((i, j))
while (cell["y"], cell["x"]) in cells_occupied_by_spanned:
cell_y, cell_x = cell["y"], cell["x"]
cells_to_the_right = [c for c in sorted_cells if c["y"] == cell_y and c["x"] >= cell_x]
for cell_to_move in cells_to_the_right:
cell_to_move["x"] += 1
cells_occupied_by_spanned.remove((cell_y, cell_x))
return sorted_cells
def html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
    """Convert html format to Deckerd table structure.

    Only the first ``<table>`` element found in ``content`` is converted.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table,
        with keys "y" (row), "x" (column), "w" (colspan), "h" (rowspan) and "content"
        (cell text). Columns are adjusted for cells displaced by spans.

    Raises:
        ValueError: If ``content`` contains no ``<table>`` element, or a
            ``colspan``/``rowspan`` attribute is not an integer.
    """
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table")
    if table is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("No <table> element found in the provided html content")

    table_data = []
    for row_index, row in enumerate(table.find_all("tr")):
        for col_index, cell_tag in enumerate(row.find_all(["th", "td"])):
            table_data.append(
                {
                    "y": row_index,
                    # Position within the row's markup; corrected for spans below.
                    "x": col_index,
                    "w": int(cell_tag.attrs.get("colspan", 1)),
                    "h": int(cell_tag.attrs.get("rowspan", 1)),
                    "content": cell_tag.text,
                }
            )
    return _move_cells_for_spanned_cells(table_data)
def deckerd_table_to_html(cells: List[Dict[str, Any]]) -> str:
    """Convert Deckerd table structure to html format.

    Args:
        cells: List of dictionaries where each dictionary represents a cell in the table.
            Every cell must provide "x", "y", "w", "h" and "content".

    Returns:
        A string with the html content of the table.

    Raises:
        ValueError: If no cell is located in row 0, so the header height cannot
            be determined.
    """
    # Row 0 is always treated as header. A first-row cell spanning several rows
    # ("h" > 1) makes the header that many rows tall, so the header height is the
    # largest row span found in row 0 (the column span "w" is irrelevant here).
    header_height = max(cell["h"] for cell in cells if cell["y"] == 0)
    header_rows = set(range(header_height))

    transformer_cells = [
        {
            "row_nums": list(range(cell["y"], cell["y"] + cell["h"])),
            "column_nums": list(range(cell["x"], cell["x"] + cell["w"])),
            "w": cell["w"],
            "h": cell["h"],
            "cell text": cell["content"],
            "column header": cell["y"] in header_rows,
        }
        for cell in cells
    ]

    # reuse the existing function to convert to HTML
    return cells_to_html(transformer_cells)
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
    """Turn an html table into the flat cell structure.

    The html is first parsed into the Deckerd format, which is then mapped
    to the final cell dictionaries.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    """
    return _convert_table_from_deckerd(html_table_to_deckerd(content))
def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert deckerd format to table structure.
Args:
content: The deckerd formatted content with a table to extract.
Returns:
A list of dictionaries where each dictionary represents a cell in the table.
"""
table_data = []
for table in content:
try:
cell_data = {
"row_index": table["y"],
"col_index": table["x"],
"content": table["content"],
}
except KeyError:
cell_data = EMPTY_CELL
except TypeError:
cell_data = EMPTY_CELL
table_data.append(cell_data)
return table_data
def _sort_table_cells(table_data: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
return sorted(table_data, key=lambda cell: (cell["row_index"], cell["col_index"]))
def extract_and_convert_tables_from_ground_truth(
    file_elements: List[Dict[str, Any]],
) -> List[List[Dict[str, Any]]]:
    """Collect the tables of a ground truth file as sorted lists of cells.

    Only elements whose "type" is "Table" and that carry a "text" field are
    considered; the "text" value is read as Deckerd formatted cells.

    Args:
        file_elements: List of elements from the ground truth file.

    Returns:
        One entry per table element. Each entry is the list of cell dictionaries
        sorted by row and column, or an empty dict when the conversion of that
        table failed (the error is printed).
    """
    tables = []
    for element in file_elements:
        # Skip anything that is not a table carrying cell data.
        if "type" not in element or element["type"] != "Table" or "text" not in element:
            continue
        try:
            cells = _convert_table_from_deckerd(element["text"])
            tables.append(_sort_table_cells(cells))
        except Exception as err:
            print(f"Error converting ground truth data: {err}")
            # Keep a placeholder so table positions stay aligned with the file.
            tables.append({})
    return tables
def extract_and_convert_tables_from_prediction(
    file_elements: List[Dict[str, Any]], source_type: str = "html"
) -> List[List[Dict[str, Any]]]:
    """Extracts and converts table data to a structured format

    Args:
        file_elements: List of elements from the file.
        source_type: 'cells' or 'html'. 'cells' refers to reading 'table_as_cells' field while
            'html' is extracted from 'text_as_html'. When the selected source yields no
            cells for an element, the other source is tried as a fallback.

    Returns:
        A list of tables with each table represented as a list of cell data dictionaries
        sorted by row and column. Table elements for which neither source yields cells
        are omitted.

    Raises:
        ValueError: If ``source_type`` is neither "html" nor "cells".
    """
    source_type_to_extraction_strategies = {
        "html": extract_cells_from_text_as_html,
        "cells": extract_cells_from_table_as_cells,
    }
    if source_type not in source_type_to_extraction_strategies:
        raise ValueError(
            f'source_type {source_type} is not valid. Allowed source_types are "html" and "cells"'
        )

    extract_cells_fn = source_type_to_extraction_strategies[source_type]
    # The fallback must be the OTHER strategy; retrying the same one can never
    # produce cells that the first attempt did not.
    fallback_source_type = "html" if source_type == "cells" else "cells"
    fallback_extract_cells_fn = source_type_to_extraction_strategies[fallback_source_type]

    predicted_table_data = []
    for element in file_elements:
        if element.get("type") != "Table":
            continue
        extracted_cells = extract_cells_fn(element)
        if not extracted_cells:
            extracted_cells = fallback_extract_cells_fn(element)
        if extracted_cells:
            predicted_table_data.append(_sort_table_cells(extracted_cells))
    return predicted_table_data
def extract_cells_from_text_as_html(element: Dict[str, Any]) -> List[Dict[str, Any]] | None:
    """Extracts and parse cells from "text_as_html" field in Element structure

    Args:
        element: Example element:
            {
                "type": "Table",
                "metadata": {
                    "text_as_html": "<table>
                                        <thead>
                                            <tr>
                                                <th>Month A.</th>
                                            </tr>
                                        </thead>
                                        <tbody>
                                            <tr>
                                                <td>22</td>
                                            </tr>
                                        </tbody>
                                    </table>"
                }
            }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
        None is returned when the element has no metadata, no "text_as_html" value,
        no table tag in that value, or when the conversion fails (the error is printed).
    """
    metadata = element.get("metadata") or {}
    val = metadata.get("text_as_html")
    # Match the tag prefix case-insensitively so that tables carrying attributes
    # (e.g. <table class="...">) or upper-case tags are not rejected.
    if not val or "<table" not in val.lower():
        return None

    try:
        return _convert_table_from_html(val)
    except Exception as e:
        # Best effort: a table that cannot be parsed is reported and skipped.
        print(f"Error converting Unstructured table data: {e}")
        return None
def extract_cells_from_table_as_cells(element: Dict[str, Any]) -> List[Dict[str, Any]] | None:
    """Extracts and parse cells from "table_as_cells" field in Element structure

    Args:
        element: Example element:
            {
                "type": "Table",
                "metadata": {
                    "table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                                       {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}]
                }
            }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
        None is returned when the element has no metadata or its "table_as_cells"
        value is missing or empty.
    """
    metadata = element.get("metadata") or {}
    predicted_cells = metadata.get("table_as_cells")
    if not predicted_cells:
        return None
    return _convert_table_from_deckerd(predicted_cells)