import numpy as np
import pandas as pd
from PIL import Image
from unstructured.partition.pdf import convert_pdf_to_images
from unstructured.partition.pdf_image.ocr import get_table_tokens
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
from unstructured.utils import requires_dependencies
@requires_dependencies("unstructured_inference")
def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame:
"""helper to JUST run table transformer on the input image/pdf file. It assumes the input is
JUST a table. This is intended to facilitate metric tracking on table structure detection ALONE
without mixing metric of element detection model"""
from unstructured_inference.models.tables import load_agent, tables_agent
load_agent()
if filename.endswith(".pdf"):
image = list(convert_pdf_to_images(filename))[0].convert("RGB")
else:
image = Image.open(filename).convert("RGB")
ocr_agent = OCRAgent.get_agent(language="eng")
return tables_agent.run_prediction(
image, ocr_tokens=get_table_tokens(image, ocr_agent), result_format="dataframe"
)
@requires_dependencies("unstructured_inference")
def eval_table_transformer_for_file(
filename: str,
true_table_filename: str,
eval_func: str = "token_ratio",
) -> float:
"""evaluate the predicted table structure vs. actual table structure by column and row as a
number between 0 and 1"""
from unstructured_inference.models.eval import compare_contents_as_df
pred_table = image_or_pdf_to_dataframe(filename).fillna("").replace(np.nan, "")
actual_table = pd.read_csv(true_table_filename).astype(str).fillna("").replace(np.nan, "")
results = np.array(
list(compare_contents_as_df(actual_table, pred_table, eval_func=eval_func).values()),
)
return results.mean() / 100.0