import difflib
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from unstructured_inference.models.eval import compare_contents_as_df


class TableAlignment:
    """Match predicted tables to ground-truth tables and score their alignment.

    Tables are lists of cell dictionaries; the methods here read the keys
    ``"content"``, ``"row_index"`` and ``"col_index"`` from each cell.

    Note:
        ``cutoff`` is stored on the instance, but every method visible here is a
        static method that takes its own arguments and does not read it.
    """

    def __init__(self, cutoff: float = 0.8):
        self.cutoff = cutoff

    @staticmethod
    def get_content_in_tables(table_data: List[List[Dict[str, Any]]]) -> List[str]:
        """Concatenate the cell contents of each table into one string per table.

        Cells without a ``"content"`` key are skipped.

        Args:
            table_data: A list of tables, each table being a list of cell data dictionaries.

        Returns:
            One space-joined string per table, in the same order as ``table_data``.
        """
        return [" ".join(cell["content"] for cell in td if "content" in cell) for td in table_data]

    @staticmethod
    def get_table_level_alignment(
        predicted_table_data: List[List[Dict[str, Any]]],
        ground_truth_table_data: List[List[Dict[str, Any]]],
    ) -> List[int]:
        """Find the best matching ground-truth table for each predicted table.

        Each predicted table's concatenated text is fuzzy-matched (``difflib``,
        fixed cutoff of 0.1) against the concatenated text of every ground-truth
        table.

        Args:
            predicted_table_data: A list of predicted tables.
            ground_truth_table_data: A list of ground truth tables.

        Returns:
            For each predicted table, the index of the best matching ground-truth
            table, or -1 when nothing clears the cutoff. When several ground-truth
            tables have identical text, the index of the first one is returned.
        """
        ground_truth_texts = TableAlignment.get_content_in_tables(ground_truth_table_data)

        matched_indices = []
        for td in predicted_table_data:
            reference = TableAlignment.get_content_in_tables([td])[0]
            matches = difflib.get_close_matches(reference, ground_truth_texts, cutoff=0.1, n=1)
            matched_indices.append(ground_truth_texts.index(matches[0]) if matches else -1)
        return matched_indices

    @staticmethod
    def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
        """Build a DataFrame of cells indexed by ``row_index``.

        Only the ``row_index``, ``col_index`` and ``content`` keys are kept; keys
        missing from a cell become NaN. ``col_index`` is cast to ``str``.
        """
        df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
        df = df.set_index("row_index")
        df["col_index"] = df["col_index"].astype(str)
        return df

    @staticmethod
    def _cell_text(cell: Dict[str, Any]) -> str:
        """Return the lower-cased content of a cell, or "" when the key is absent."""
        # A missing key is treated as empty text, mirroring the fillna("") applied
        # to the DataFrames used for the content comparison.
        return cell.get("content", "").lower()

    @staticmethod
    def _mean(values: List[float]) -> float:
        """Return the mean of ``values``, or NaN for an empty list.

        ``np.mean([])`` also yields NaN but emits a RuntimeWarning; the explicit
        branch keeps the value and drops the warning.
        """
        return np.mean(values) if values else float("nan")

    @staticmethod
    def _get_index_accuracy(
        predicted_td: List[Dict[str, Any]],
        ground_truth_td: List[Dict[str, Any]],
        cutoff: float,
    ) -> Tuple[float, float]:
        """Score how often matched cells share a column / row index with ground truth.

        Returns:
            ``(col_index_acc, row_index_acc)`` for one table pair, each rounded to
            two decimals; ``(0, 0)`` when no predicted cell could be matched.
        """
        gt_contents = [TableAlignment._cell_text(gt_cell) for gt_cell in ground_truth_td]
        used_indices = set()
        aligned_cols = 0
        aligned_rows = 0
        total = 0

        for cell in predicted_td:
            matches = difflib.get_close_matches(
                TableAlignment._cell_text(cell),
                gt_contents,
                cutoff=cutoff,
                n=1,
            )
            if not matches:
                continue

            # The same text can occur in several ground-truth cells: hand out each
            # occurrence once, in order, instead of always picking the first one.
            candidates = [i for i, text in enumerate(gt_contents) if text == matches[0]]
            unused = [i for i in candidates if i not in used_indices]
            if not unused:
                # Every occurrence is taken: forget all bookkeeping and start over
                # from the first occurrence.
                used_indices.clear()
                unused = candidates
            matched_idx = unused[0]
            used_indices.add(matched_idx)

            gt_cell = ground_truth_td[matched_idx]
            if cell["row_index"] == gt_cell["row_index"]:
                aligned_rows += 1
            if cell["col_index"] == gt_cell["col_index"]:
                aligned_cols += 1
            total += 1

        if total == 0:
            return 0, 0
        return round(aligned_cols / total, 2), round(aligned_rows / total, 2)

    @staticmethod
    def get_element_level_alignment(
        predicted_table_data: List[List[Dict[str, Any]]],
        ground_truth_table_data: List[List[Dict[str, Any]]],
        matched_indices: List[int],
        cutoff: float = 0.8,
    ) -> Dict[str, float]:
        """Align the cells of predicted tables with their ground-truth tables.

        For every predicted table with a matched ground-truth table, two things are
        scored: the content similarity reported by ``compare_contents_as_df`` and
        the share of fuzzy-matched cells whose row / column index equals the
        ground-truth cell's. Predicted tables without a match (index -1) and
        ground-truth tables that no predicted table matched each contribute a
        score of 0 to every metric.

        Args:
            predicted_table_data: A list of predicted tables.
            ground_truth_table_data: A list of ground truth tables.
            matched_indices: Index of the best matching ground truth table for each
                predicted table, -1 meaning "no match".
            cutoff: The ``difflib`` cutoff used when matching cell contents.

        Returns:
            A dictionary with the keys ``col_index_acc``, ``row_index_acc``,
            ``col_content_acc`` and ``row_content_acc``, each the mean over all
            scored tables rounded to two decimals. All values are NaN when there
            is nothing to score (no predicted and no ground-truth tables).
        """
        content_diff_cols = []
        content_diff_rows = []
        col_index_acc = []
        row_index_acc = []

        for idx, td in zip(matched_indices, predicted_table_data):
            if idx == -1:
                content_diff_cols.append(0)
                content_diff_rows.append(0)
                col_index_acc.append(0)
                row_index_acc.append(0)
                continue

            ground_truth_td = ground_truth_table_data[idx]

            # Row and column content accuracy.
            predict_table_df = TableAlignment._zip_to_dataframe(td)
            ground_truth_table_df = TableAlignment._zip_to_dataframe(ground_truth_td)
            table_content_diff = compare_contents_as_df(
                ground_truth_table_df.fillna(""),
                predict_table_df.fillna(""),
            )
            content_diff_cols.append(table_content_diff["by_col_token_ratio"])
            content_diff_rows.append(table_content_diff["by_row_token_ratio"])

            # Row and column index accuracy.
            table_col_index_acc, table_row_index_acc = TableAlignment._get_index_accuracy(
                td,
                ground_truth_td,
                cutoff,
            )
            col_index_acc.append(table_col_index_acc)
            row_index_acc.append(table_row_index_acc)

        # Ground-truth tables that no prediction was matched to count as zeros.
        matched_gt_indices = set(matched_indices)
        missed_count = sum(
            1 for gt_idx in range(len(ground_truth_table_data)) if gt_idx not in matched_gt_indices
        )
        for scores in (content_diff_cols, content_diff_rows, col_index_acc, row_index_acc):
            scores.extend([0] * missed_count)

        # The token ratios are divided by 100 here, so they are presumably reported
        # on a 0-100 scale by compare_contents_as_df -- confirm against that helper.
        return {
            "col_index_acc": round(TableAlignment._mean(col_index_acc), 2),
            "row_index_acc": round(TableAlignment._mean(row_index_acc), 2),
            "col_content_acc": round(TableAlignment._mean(content_diff_cols) / 100.0, 2),
            "row_content_acc": round(TableAlignment._mean(content_diff_rows) / 100.0, 2),
        }