""" Implements object detection metrics: average precision, precision, recall, and f1 score. """ import json from dataclasses import dataclass from pathlib import Path import numpy as np import torch IOU_THRESHOLDS = torch.tensor( [0.5000, 0.5500, 0.6000, 0.6500, 0.7000, 0.7500, 0.8000, 0.8500, 0.9000, 0.9500] ) SCORE_THRESHOLD = 0.1 RECALL_THRESHOLDS = torch.arange(0, 1.01, 0.01) @dataclass class ObjectDetectionAggregatedEvaluation: """Class representing a gathered class-aggregated object detection metrics""" f1_score: float precision: float recall: float m_ap: float @dataclass class ObjectDetectionPerClassEvaluation: """Class representing a gathered object detection metrics per-class""" f1_score: dict[str, float] precision: dict[str, float] recall: dict[str, float] m_ap: dict[str, float] @classmethod def from_tensors(cls, ap, precision, recall, f1, class_labels): f1_score = {class_labels[i]: f1[i] for i in range(len(class_labels))} precision = {class_labels[i]: precision[i] for i in range(len(class_labels))} recall = {class_labels[i]: recall[i] for i in range(len(class_labels))} m_ap = {class_labels[i]: ap[i] for i in range(len(class_labels))} return cls(f1_score, precision, recall, m_ap) class ObjectDetectionEvalProcessor: iou_thresholds = IOU_THRESHOLDS score_threshold = SCORE_THRESHOLD recall_thresholds = RECALL_THRESHOLDS def __init__( self, document_preds: list[torch.Tensor], document_targets: list[torch.Tensor], pages_height: list[int], pages_width: list[int], class_labels: list[str], device: str = "cpu", ): """ Initializes the ObjectDetection prediction and ground truth. Args: document_preds (list): list (of length pages of document) of Tensors of shape (num_predictions, 6) format: (x1, y1, x2, y2, confidence,class_label) where x1,y1,x2,y2 are according to image size document_targets (list): list (of length pages of document) of Tensors of shape (num_targets, 6) format: (label, x1, y1, x2, y2) where x,y,w,h are according to image size pages_height (list): list of height of each page in the document pages_width (list): list of width of each page in the document class_labels (list): list of class labels """ self.device = device self.document_preds = [pred.to(device) for pred in document_preds] self.document_targets = [target.to(device) for target in document_targets] self.pages_height = pages_height self.pages_width = pages_width self.class_labels = class_labels @classmethod def from_json_files( cls, prediction_file_path: Path, ground_truth_file_path: Path, ) -> "ObjectDetectionEvalProcessor": """ Initializes the ObjectDetection prediction and ground truth, and converts the data to the required format. Args: prediction_file_path (Path): path to json file with predictions dump from OD model ground_truth_file_path (Path): path to json file with OD ground truth data """ # TODO: Test after https://unstructured-ai.atlassian.net/browse/ML-92 # is done. with open(prediction_file_path) as f: predictions_data = json.load(f) with open(ground_truth_file_path) as f: ground_truth_data = json.load(f) assert sorted(predictions_data["object_detection_classes"]) == sorted( ground_truth_data["object_detection_classes"] ), "Classes in predictions and ground truth do not match." assert len(predictions_data["pages"]) == len( ground_truth_data["pages"] ), "Pages number in predictions and ground truth do not match." 


class ObjectDetectionEvalProcessor:
    """Computes object detection metrics for a document from predictions and ground truth."""

    iou_thresholds = IOU_THRESHOLDS
    score_threshold = SCORE_THRESHOLD
    recall_thresholds = RECALL_THRESHOLDS

    def __init__(
        self,
        document_preds: list[torch.Tensor],
        document_targets: list[torch.Tensor],
        pages_height: list[int],
        pages_width: list[int],
        class_labels: list[str],
        device: str = "cpu",
    ):
        """
        Initializes the ObjectDetection prediction and ground truth.

        Args:
            document_preds (list): list (of length pages of document) of Tensors of shape
                (num_predictions, 6)
                format: (x1, y1, x2, y2, confidence, class_label)
                where x1, y1, x2, y2 are scaled to image size
            document_targets (list): list (of length pages of document) of Tensors of shape
                (num_targets, 5)
                format: (label, x1, y1, x2, y2)
                where x1, y1, x2, y2 are scaled to image size
            pages_height (list): list of height of each page in the document
            pages_width (list): list of width of each page in the document
            class_labels (list): list of class labels
        """
        self.device = device
        self.document_preds = [pred.to(device) for pred in document_preds]
        self.document_targets = [target.to(device) for target in document_targets]
        self.pages_height = pages_height
        self.pages_width = pages_width
        self.class_labels = class_labels

    @classmethod
    def from_json_files(
        cls,
        prediction_file_path: Path,
        ground_truth_file_path: Path,
    ) -> "ObjectDetectionEvalProcessor":
        """
        Initializes the ObjectDetection prediction and ground truth, and converts
        the data to the required format.

        Args:
            prediction_file_path (Path): path to json file with predictions dump from OD model
            ground_truth_file_path (Path): path to json file with OD ground truth data
        """
        # TODO: Test after https://unstructured-ai.atlassian.net/browse/ML-92 is done.
        with open(prediction_file_path) as f:
            predictions_data = json.load(f)
        with open(ground_truth_file_path) as f:
            ground_truth_data = json.load(f)

        assert sorted(predictions_data["object_detection_classes"]) == sorted(
            ground_truth_data["object_detection_classes"]
        ), "Classes in predictions and ground truth do not match."
        assert len(predictions_data["pages"]) == len(
            ground_truth_data["pages"]
        ), "Number of pages in predictions and ground truth do not match."

        for pred_page, gt_page in zip(
            sorted(predictions_data["pages"], key=lambda p: p["number"]),
            sorted(ground_truth_data["pages"], key=lambda p: p["number"]),
        ):
            assert pred_page["number"] == gt_page["number"], (
                f"Page numbers in predictions {prediction_file_path.name} "
                f"({pred_page['number']}) and ground truth {ground_truth_file_path.name} "
                f"({gt_page['number']}) do not match."
            )
            page_num = pred_page["number"]
            # TODO: translate the bboxes instead of raising error
            assert pred_page["size"] == gt_page["size"], (
                f"Page sizes in predictions {prediction_file_path.name} "
                f"({pred_page['size']['width']} x {pred_page['size']['height']}) "
                f"and ground truth {ground_truth_file_path.name} "
                f"({gt_page['size']['width']} x {gt_page['size']['height']}) "
                f"do not match for page {page_num}."
            )

        class_labels = predictions_data["object_detection_classes"]
        document_preds = cls._process_data(predictions_data, class_labels, prediction=True)
        document_targets = cls._process_data(ground_truth_data, class_labels)
        pages_height, pages_width = cls._parse_page_dimensions(predictions_data)

        return cls(document_preds, document_targets, pages_height, pages_width, class_labels)
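
    # Sketch of the JSON layout that ``from_json_files`` expects, inferred from the
    # parsing code in ``_process_data`` and ``_parse_page_dimensions`` (the field
    # names are assumptions based on this module, not a documented schema):
    #
    #   {
    #     "object_detection_classes": ["Table", "Text", ...],
    #     "pages": [
    #       {
    #         "number": 1,
    #         "size": {"height": 1100, "width": 850},
    #         "elements": [
    #           {"type": "Table", "bbox": [x1, y1, x2, y2], "prob": 0.97},
    #           ...
    #         ]
    #       }
    #     ]
    #   }
    #
    # ``prob`` is only present (and only read) for prediction files.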

    def get_metrics(
        self,
    ) -> tuple[ObjectDetectionAggregatedEvaluation, ObjectDetectionPerClassEvaluation]:
        """Get per document OD metrics.

        Returns:
            tuple: Tuple of ObjectDetectionAggregatedEvaluation and
                ObjectDetectionPerClassEvaluation
        """
        document_matchings = []
        for preds, targets, height, width in zip(
            self.document_preds, self.document_targets, self.pages_height, self.pages_width
        ):
            # iterate over each page
            page_matching_tensors = self._compute_page_detection_matching(
                preds=preds,
                targets=targets,
                height=height,
                width=width,
            )
            document_matchings.append(page_matching_tensors)

        # compute metrics for all detections and targets
        mean_ap, mean_precision, mean_recall, mean_f1 = -1.0, -1.0, -1.0, -1.0
        num_cls = len(self.class_labels)
        mean_ap_per_class = np.full(num_cls, np.nan)
        mean_precision_per_class = np.full(num_cls, np.nan)
        mean_recall_per_class = np.full(num_cls, np.nan)
        mean_f1_per_class = np.full(num_cls, np.nan)

        if len(document_matchings):
            matching_info_tensors = [torch.cat(x, 0) for x in list(zip(*document_matchings))]

            # shape (n_class, nb_iou_thresh)
            (
                ap_per_present_classes,
                precision_per_present_classes,
                recall_per_present_classes,
                f1_per_present_classes,
                present_classes,
            ) = self._compute_detection_metrics(
                *matching_info_tensors,
            )

            # Precision, recall and f1 are computed for the IoU threshold range and
            # averaged over classes; results before version 3.0.4 (Dec 11 2022) were
            # computed only for the smallest value (i.e. IoU 0.5 if the metric is @0.5:0.95)
            mean_precision, mean_recall, mean_f1 = (
                precision_per_present_classes.mean(),
                recall_per_present_classes.mean(),
                f1_per_present_classes.mean(),
            )

            # mAP is averaged over IoU thresholds and over classes
            mean_ap = ap_per_present_classes.mean()

            # Fill the arrays of per-class scores with values for classes that were
            # present in the dataset
            ap_per_class = ap_per_present_classes.mean(1)
            precision_per_class = precision_per_present_classes.mean(1)
            recall_per_class = recall_per_present_classes.mean(1)
            f1_per_class = f1_per_present_classes.mean(1)
            for i, class_index in enumerate(present_classes):
                mean_ap_per_class[class_index] = float(ap_per_class[i])
                mean_precision_per_class[class_index] = float(precision_per_class[i])
                mean_recall_per_class[class_index] = float(recall_per_class[i])
                mean_f1_per_class[class_index] = float(f1_per_class[i])

        od_per_class_evaluation = ObjectDetectionPerClassEvaluation.from_tensors(
            ap=mean_ap_per_class,
            precision=mean_precision_per_class,
            recall=mean_recall_per_class,
            f1=mean_f1_per_class,
            class_labels=self.class_labels,
        )
        od_evaluation = ObjectDetectionAggregatedEvaluation(
            f1_score=float(mean_f1),
            precision=float(mean_precision),
            recall=float(mean_recall),
            m_ap=float(mean_ap),
        )
        return od_evaluation, od_per_class_evaluation

    @staticmethod
    def _parse_page_dimensions(data: dict) -> tuple[list, list]:
        """Process the page dimensions from the json file to the required format."""
        pages_height = []
        pages_width = []
        for page in data["pages"]:
            pages_height.append(page["size"]["height"])
            pages_width.append(page["size"]["width"])
        return pages_height, pages_width

    @staticmethod
    def _process_data(data: dict, class_labels, prediction: bool = False) -> list[torch.Tensor]:
        """Process the elements from the json file to the required format."""
        pages_list = []
        for page in data["pages"]:
            page_elements = []
            for element in page["elements"]:
                # Extract coordinates, confidence, and class label from each element
                class_label = element["type"]
                class_idx = class_labels.index(class_label)
                x1, y1, x2, y2 = element["bbox"]
                if prediction:
                    confidence = element["prob"]
                    page_elements.append([x1, y1, x2, y2, confidence, class_idx])
                else:
                    page_elements.append([class_idx, x1, y1, x2, y2])
            # Reshape so that pages without elements still yield 2D tensors
            page_tensor = torch.tensor(page_elements).reshape(-1, 6 if prediction else 5)
            pages_list.append(page_tensor)
        return pages_list
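
    # Hedged illustration of the per-page tensor rows built by ``_process_data``
    # (coordinates and scores are made up):
    #
    #   prediction   {"type": "Text", "bbox": [10, 20, 110, 40], "prob": 0.9}
    #       -> [10.0, 20.0, 110.0, 40.0, 0.9, class_idx]   # (x1, y1, x2, y2, conf, class)
    #   ground truth {"type": "Text", "bbox": [10, 20, 110, 40]}
    #       -> [class_idx, 10.0, 20.0, 110.0, 40.0]        # (class, x1, y1, x2, y2)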

    @staticmethod
    def _get_top_k_idx_per_cls(
        preds_scores: torch.Tensor, preds_cls: torch.Tensor, top_k: int
    ) -> torch.Tensor:
        # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Get the indexes of all the top k predictions for every class.

        Args:
            preds_scores: The confidence scores, vector of shape (n_pred)
            preds_cls: The predicted class, vector of shape (n_pred)
            top_k: Number of predictions to keep per class, ordered by confidence score

        Returns:
            top_k_idx: Indexes of the top k predictions. length <= (k * n_unique_class)
        """
        n_unique_cls = torch.max(preds_cls)
        mask = preds_cls.view(-1, 1) == torch.arange(
            n_unique_cls + 1, device=preds_scores.device
        ).view(1, -1)
        preds_scores_per_cls = preds_scores.view(-1, 1) * mask

        sorted_scores_per_cls, sorting_idx = preds_scores_per_cls.sort(0, descending=True)
        idx_with_satisfying_scores = sorted_scores_per_cls[:top_k, :].nonzero(as_tuple=False)
        top_k_idx = sorting_idx[idx_with_satisfying_scores.split(1, dim=1)]
        return top_k_idx.view(-1)

    @staticmethod
    def _change_bbox_bounds_for_image_size(
        boxes: torch.Tensor, img_shape: tuple[int, int]
    ) -> torch.Tensor:
        # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Clips bboxes to image boundaries.

        Args:
            boxes: Input bounding boxes in XYXY format, of shape [..., 4]
            img_shape: Image shape (height, width)

        Returns:
            clipped_boxes: Clipped bboxes in XYXY format, of shape [..., 4]
        """
        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(min=0, max=img_shape[1])
        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(min=0, max=img_shape[0])
        return boxes

    @staticmethod
    def _box_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor:
        # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Return intersection-over-union (Jaccard index) of boxes.

        Both sets of boxes are expected to be in (x1, y1, x2, y2) format.

        Args:
            box1: Tensor of shape [N, 4]
            box2: Tensor of shape [M, 4]

        Returns:
            iou: Tensor of shape [N, M]: the NxM matrix containing the pairwise IoU values
                for every element in boxes1 and boxes2
        """

        def box_area(box):
            # box = 4xn
            return (box[2] - box[0]) * (box[3] - box[1])

        area1 = box_area(box1.T)
        area2 = box_area(box2.T)

        # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
        inter = (
            (
                torch.min(box1[:, None, 2:], box2[:, 2:])
                - torch.max(box1[:, None, :2], box2[:, :2])
            )
            .clamp(0)
            .prod(2)
        )
        # iou = inter / (area1 + area2 - inter)
        return inter / (area1[:, None] + area2 - inter)
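
    # Worked example for ``_box_iou`` (toy boxes, checked by hand): the boxes
    # (0, 0, 2, 2) and (1, 1, 3, 3) each have area 4 and overlap in a 1x1 square,
    # so IoU = 1 / (4 + 4 - 1) = 1/7 ~= 0.1429:
    #
    #   box1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
    #   box2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
    #   ObjectDetectionEvalProcessor._box_iou(box1, box2)  # tensor([[0.1429]])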

    def _compute_targets(
        self,
        preds_box_xyxy: torch.Tensor,
        preds_cls: torch.Tensor,
        targets_box_xyxy: torch.Tensor,
        targets_cls: torch.Tensor,
        preds_matched: torch.Tensor,
        targets_matched: torch.Tensor,
        preds_idx_to_use: torch.Tensor,
        iou_thresholds: torch.Tensor,
    ) -> torch.Tensor:
        # From: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Computes the matching targets based on IoU for regular scenarios.

        Args:
            preds_box_xyxy: (torch.Tensor) Predicted bounding boxes in XYXY format.
            preds_cls: (torch.Tensor) Predicted classes.
            targets_box_xyxy: (torch.Tensor) Target bounding boxes in XYXY format.
            targets_cls: (torch.Tensor) Target classes.
            preds_matched: (torch.Tensor) Tensor indicating which predictions are matched.
            targets_matched: (torch.Tensor) Tensor indicating which targets are matched.
            preds_idx_to_use: (torch.Tensor) Indices of predictions to use.
            iou_thresholds: (torch.Tensor) IoU thresholds to evaluate the matching against.

        Returns:
            preds_matched: Computed matching targets.
        """
        # shape = (n_preds x n_targets)
        iou = self._box_iou(preds_box_xyxy[preds_idx_to_use], targets_box_xyxy)

        # Fill IoU values at index (i, j) with 0 when the prediction (i) and target (j)
        # are of a different class.
        # Filling with 0 is equivalent to ignoring these values,
        # since we want IoU > iou_threshold > 0
        cls_mismatch = preds_cls[preds_idx_to_use].view(-1, 1) != targets_cls.view(1, -1)
        iou[cls_mismatch] = 0

        # The matching priority is first detection confidence and then IoU value.
        # The detections are already sorted by confidence in NMS,
        # so here for each prediction we order the targets by IoU.
        sorted_iou, target_sorted = iou.sort(descending=True, stable=True)

        # Only iterate over IoU values higher than the min threshold to speed up the process
        for pred_selected_i, target_sorted_i in (sorted_iou > iou_thresholds[0]).nonzero(
            as_tuple=False
        ):
            # pred_selected_i and target_sorted_i are relative to filters/sorting,
            # so we extract their absolute indexes
            pred_i = preds_idx_to_use[pred_selected_i]
            target_i = target_sorted[pred_selected_i, target_sorted_i]

            # Vector[j], True when IoU(pred_i, target_i) is above the (j)th threshold
            is_iou_above_threshold = (
                sorted_iou[pred_selected_i, target_sorted_i] > iou_thresholds
            )

            # Vector[j], True when both pred_i and target_i are not matched yet
            # for the (j)th threshold
            are_candidates_free = torch.logical_and(
                ~preds_matched[pred_i, :], ~targets_matched[target_i, :]
            )

            # Vector[j], True when (pred_i, target_i) can be matched for the (j)th threshold
            are_candidates_good = torch.logical_and(is_iou_above_threshold, are_candidates_free)

            # For every threshold (j) where target_i and pred_i can be matched together
            # (are_candidates_good[j] == True),
            # fill the matching placeholders with True
            targets_matched[target_i, are_candidates_good] = True
            preds_matched[pred_i, are_candidates_good] = True

            # When all the targets are matched with a prediction for every IoU threshold, stop.
            if targets_matched.all():
                break

        return preds_matched
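
    # Illustrative note on the matching above (numbers invented): with the default
    # IOU_THRESHOLDS of [0.50, 0.55, ..., 0.95], a prediction/target pair with
    # IoU = 0.72 can only be matched at the five thresholds up to 0.70, so the
    # corresponding row of ``preds_matched`` could become
    #   [True, True, True, True, True, False, False, False, False, False]
    # provided neither box was already matched at those thresholds.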

    def _compute_page_detection_matching(
        self,
        preds: torch.Tensor,
        targets: torch.Tensor,
        height: int,
        width: int,
        top_k: int = 100,
        return_on_cpu: bool = True,
    ) -> tuple:
        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Match predictions (NMS output) and targets (ground truth) with respect to
        IoU and confidence score for a given page image.

        Args:
            preds: Tensor of shape (num_img_predictions, 6)
                format: (x1, y1, x2, y2, confidence, class_label)
                where x1, y1, x2, y2 are scaled to image size
            targets: targets for this image of shape (num_img_targets, 5)
                format: (label, x1, y1, x2, y2)
                where x1, y1, x2, y2 are scaled to image size
            height: height of the image
            width: width of the image
            top_k: Number of predictions to keep per class, ordered by confidence score
            return_on_cpu: If True, the output will be returned on "CPU",
                otherwise it will be returned on "device"

        Returns:
            preds_matched: Tensor of shape (num_img_predictions, n_thresholds)
                True when prediction (i) is matched with a target
                with respect to the (j)th threshold
            preds_to_ignore: Tensor of shape (num_img_predictions, n_thresholds)
                True when prediction (i) is matched with a crowd target
                with respect to the (j)th threshold
            preds_scores: Tensor of shape (num_img_predictions),
                confidence score for every prediction
            preds_cls: Tensor of shape (num_img_predictions),
                predicted class for every prediction
            targets_cls: Tensor of shape (num_img_targets),
                ground truth class for every target
        """
        thresholds = self.iou_thresholds.to(device=self.device)
        num_thresholds = len(thresholds)

        if preds is None or len(preds) == 0:
            preds_matched = torch.zeros(
                (0, num_thresholds), dtype=torch.bool, device=self.device
            )
            preds_to_ignore = torch.zeros(
                (0, num_thresholds), dtype=torch.bool, device=self.device
            )
            preds_scores = torch.tensor([], dtype=torch.float32, device=self.device)
            preds_cls = torch.tensor([], dtype=torch.float32, device=self.device)
            targets_cls = targets[:, 0].to(device=self.device)
            return preds_matched, preds_to_ignore, preds_scores, preds_cls, targets_cls

        preds_matched = torch.zeros(
            len(preds), num_thresholds, dtype=torch.bool, device=self.device
        )
        targets_matched = torch.zeros(
            len(targets), num_thresholds, dtype=torch.bool, device=self.device
        )
        preds_to_ignore = torch.zeros(
            len(preds), num_thresholds, dtype=torch.bool, device=self.device
        )

        preds_cls, preds_box, preds_scores = preds[:, -1], preds[:, 0:4], preds[:, 4]
        targets_cls, targets_box = targets[:, 0], targets[:, 1:5]

        # Ignore all but the predictions that were top_k for their class
        preds_idx_to_use = self._get_top_k_idx_per_cls(preds_scores, preds_cls, top_k)
        preds_to_ignore[:, :] = True
        preds_to_ignore[preds_idx_to_use] = False

        if len(targets) > 0:
            self._change_bbox_bounds_for_image_size(preds, (height, width))

            preds_matched = self._compute_targets(
                preds_box,
                preds_cls,
                targets_box,
                targets_cls,
                preds_matched,
                targets_matched,
                preds_idx_to_use,
                thresholds,
            )

        return preds_matched, preds_to_ignore, preds_scores, preds_cls, targets_cls

    def _compute_detection_metrics(
        self,
        preds_matched: torch.Tensor,
        preds_to_ignore: torch.Tensor,
        preds_scores: torch.Tensor,
        preds_cls: torch.Tensor,
        targets_cls: torch.Tensor,
    ) -> tuple:
        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Compute the lists of precision, recall, mAP and f1 for every class.

        Args:
            preds_matched: Tensor of shape (num_predictions, n_iou_thresholds)
                True when prediction (i) is matched with a target
                with respect to the (j)th IoU threshold
            preds_to_ignore: Tensor of shape (num_predictions, n_iou_thresholds)
                True when prediction (i) is matched with a crowd target
                with respect to the (j)th IoU threshold
            preds_scores: Tensor of shape (num_predictions),
                confidence score for every prediction
            preds_cls: Tensor of shape (num_predictions),
                predicted class for every prediction
            targets_cls: Tensor of shape (num_targets),
                ground truth class for every target box to be detected

        Returns:
            ap, precision, recall, f1: Tensors of shape (n_class, nb_iou_thrs)
            unique_classes: Vector with all unique target classes
        """
        preds_matched, preds_to_ignore = (
            preds_matched.to(self.device),
            preds_to_ignore.to(self.device),
        )
        preds_scores, preds_cls, targets_cls = (
            preds_scores.to(self.device),
            preds_cls.to(self.device),
            targets_cls.to(self.device),
        )

        recall_thresholds = self.recall_thresholds.to(self.device)
        score_threshold = self.score_threshold

        unique_classes = torch.unique(targets_cls).long()

        n_class, nb_iou_thrs = len(unique_classes), preds_matched.shape[-1]

        ap = torch.zeros((n_class, nb_iou_thrs), device=self.device)
        precision = torch.zeros((n_class, nb_iou_thrs), device=self.device)
        recall = torch.zeros((n_class, nb_iou_thrs), device=self.device)

        for cls_i, class_value in enumerate(unique_classes):
            cls_preds_idx, cls_targets_idx = (
                preds_cls == class_value,
                targets_cls == class_value,
            )
            cls_ap, cls_precision, cls_recall = self._compute_detection_metrics_per_cls(
                preds_matched=preds_matched[cls_preds_idx],
                preds_to_ignore=preds_to_ignore[cls_preds_idx],
                preds_scores=preds_scores[cls_preds_idx],
                n_targets=cls_targets_idx.sum(),
                recall_thresholds=recall_thresholds,
                score_threshold=score_threshold,
            )
            ap[cls_i, :] = cls_ap
            precision[cls_i, :] = cls_precision
            recall[cls_i, :] = cls_recall

        f1 = 2 * precision * recall / (precision + recall + 1e-16)

        return ap, precision, recall, f1, unique_classes
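
    # Hedged note on the AP computation implemented below: precision is sampled at
    # the 101 recall levels in RECALL_THRESHOLDS (0.00, 0.01, ..., 1.00) and averaged,
    # COCO-style. For example, if a class's rolling recall never exceeds 0.60, every
    # recall threshold above 0.60 samples the appended zero row, pulling its AP down.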

    def _compute_detection_metrics_per_cls(
        self,
        preds_matched: torch.Tensor,
        preds_to_ignore: torch.Tensor,
        preds_scores: torch.Tensor,
        n_targets: int,
        recall_thresholds: torch.Tensor,
        score_threshold: float,
    ):
        # Adapted from: https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/utils/detection_utils.py  # noqa E501
        """Compute the list of precision, recall and AP of a given class for every
        recall threshold.

        Args:
            preds_matched: Tensor of shape (num_predictions, n_thresholds)
                True when prediction (i) is matched with a target
                with respect to the (j)th threshold
            preds_to_ignore: Tensor of shape (num_predictions, n_thresholds)
                True when prediction (i) is matched with a crowd target
                with respect to the (j)th threshold
            preds_scores: Tensor of shape (num_predictions),
                confidence score for every prediction
            n_targets: Number of target boxes of this class
            recall_thresholds: Tensor of shape (max_n_rec_thresh),
                list of recall thresholds used to compute mAP
            score_threshold: Minimum confidence score to consider a prediction
                for the computation of precision and recall (not mAP)

        Returns:
            ap, precision, recall: Tensors of shape (nb_thrs)
        """
        nb_iou_thrs = preds_matched.shape[-1]

        tps = preds_matched
        fps = torch.logical_and(
            torch.logical_not(preds_matched), torch.logical_not(preds_to_ignore)
        )

        if len(tps) == 0:
            return (
                torch.zeros(nb_iou_thrs, device=self.device),
                torch.zeros(nb_iou_thrs, device=self.device),
                torch.zeros(nb_iou_thrs, device=self.device),
            )

        # Sort by decreasing score
        dtype = (
            torch.uint8
            if preds_scores.is_cuda and preds_scores.dtype is torch.bool
            else preds_scores.dtype
        )
        sort_ind = torch.argsort(preds_scores.to(dtype), descending=True)
        tps = tps[sort_ind, :]
        fps = fps[sort_ind, :]
        preds_scores = preds_scores[sort_ind].contiguous()

        # Rolling sum over the predictions
        rolling_tps = torch.cumsum(tps, dim=0, dtype=torch.float)
        rolling_fps = torch.cumsum(fps, dim=0, dtype=torch.float)

        rolling_recalls = rolling_tps / n_targets
        rolling_precisions = rolling_tps / (
            rolling_tps + rolling_fps + torch.finfo(torch.float64).eps
        )

        # Reversed cummax to only have decreasing values
        rolling_precisions = rolling_precisions.flip(0).cummax(0).values.flip(0)

        # ==================
        # RECALL & PRECISION

        # We want the rolling precision/recall at index i so that:
        # preds_scores[i-1] >= score_threshold > preds_scores[i]
        # Note: torch.searchsorted works on an increasing sequence and preds_scores
        # is decreasing, so we work with "-"
        # Note2: right=True due to the negation
        lowest_score_above_threshold = torch.searchsorted(
            -preds_scores, -score_threshold, right=True
        )

        if lowest_score_above_threshold == 0:
            # Here score_threshold > preds_scores[0], so no pred is above the threshold
            recall = torch.zeros(nb_iou_thrs, device=self.device)
            # precision is not really defined when there is no pred,
            # but we need to give it a value
            precision = torch.zeros(nb_iou_thrs, device=self.device)
        else:
            recall = rolling_recalls[lowest_score_above_threshold - 1]
            precision = rolling_precisions[lowest_score_above_threshold - 1]

        # ==================
        # AVERAGE PRECISION

        # shape = (nb_iou_thrs, n_recall_thresholds)
        recall_thresholds = recall_thresholds.view(1, -1).repeat(nb_iou_thrs, 1)

        # We want the index i so that:
        # rolling_recalls[i-1] < recall_thresholds[k] <= rolling_recalls[i]
        # Note: when recall_thresholds[k] > max(rolling_recalls), i = len(rolling_recalls)
        # Note2: we work with the transpose (.T) to apply torch.searchsorted on the
        # first dim instead of the last one
        recall_threshold_idx = torch.searchsorted(
            rolling_recalls.T.contiguous(), recall_thresholds, right=False
        ).T

        # When recall_thresholds[k] > max(rolling_recalls),
        # rolling_precisions[i] is not defined, and we want precision = 0
        rolling_precisions = torch.cat(
            (rolling_precisions, torch.zeros(1, nb_iou_thrs, device=self.device)), dim=0
        )

        # shape = (n_recall_thresholds, nb_iou_thrs)
        sampled_precision_points = torch.gather(
            input=rolling_precisions, index=recall_threshold_idx, dim=0
        )

        # Average over the recall_thresholds
        ap = sampled_precision_points.mean(0)

        return ap, precision, recall


if __name__ == "__main__":
    from dataclasses import asdict

    # Example usage
    prediction_file_paths = [
        Path("pths/to/predictions.json"),
        Path("pths/to/predictions2.json"),
    ]
    ground_truth_file_paths = [
        Path("pths/to/ground_truth.json"),
        Path("pths/to/ground_truth2.json"),
    ]
    for prediction_file_path, ground_truth_file_path in zip(
        prediction_file_paths, ground_truth_file_paths
    ):
        eval_processor = ObjectDetectionEvalProcessor.from_json_files(
            prediction_file_path,
            ground_truth_file_path,
        )
        metrics, per_class_metrics = eval_processor.get_metrics()
        print(f"Metrics for {ground_truth_file_path.name}:\n{asdict(metrics)}")
        print(
            f"Per class metrics for {ground_truth_file_path.name}:\n"
            f"{asdict(per_class_metrics)}"
        )