Source code for mmeval.metrics.voc_map

# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import warnings
from multiprocessing.pool import Pool
from typing import Dict, List, Optional, Sequence, Tuple, Union

from mmeval.core.base_metric import BaseMetric
from mmeval.metrics.utils import calculate_bboxes_area, calculate_overlaps
from mmeval.utils import is_list_of


def calculate_average_precision(recalls: np.ndarray,
                                precisions: np.ndarray,
                                mode: str = 'area') -> float:
    """Compute the average precision (AP) of one precision-recall curve.

    Args:
        recalls (ndarray): Recall values, shape (num_dets, ).
        precisions (ndarray): Precision values, shape (num_dets, ).
        mode (str): Either 'area' or '11points'. With 'area' the AP is the
            area under the monotonically decreasing envelope of the
            precision-recall curve. With '11points' the AP is the mean of
            the best precision reachable at the recall levels
            [0, 0.1, ..., 1.0]. Defaults to 'area'.

    Returns:
        float: The average precision.
    """
    assert mode in ['area', '11points']

    if mode != 'area':
        # 11-point interpolation: for every recall level take the highest
        # precision among detections reaching at least that recall
        # (0 when no detection reaches it).
        total = 0.0
        for recall_level in np.arange(0, 1 + 1e-3, 0.1):
            reachable = precisions[recalls >= recall_level]
            total += reachable.max() if reachable.size > 0 else 0
        return total / 11.0

    # Pad the curve so it starts at recall 0 and ends at recall 1.
    padded_recalls = np.hstack((0, recalls, 1))
    padded_precisions = np.hstack((0, precisions, 0))
    # Running maximum taken from the right: each entry becomes the largest
    # precision found at that position or any later one.
    envelope = np.maximum.accumulate(padded_precisions[::-1])[::-1]
    # Only positions where the recall actually changes contribute area.
    steps = np.flatnonzero(padded_recalls[1:] != padded_recalls[:-1])
    widths = padded_recalls[steps + 1] - padded_recalls[steps]
    return np.sum(widths * envelope[steps + 1])


def filter_by_bboxes_area(bboxes: np.ndarray,
                          min_area: Optional[float],
                          max_area: Optional[float],
                          use_legacy_coordinate=False) -> np.ndarray:
    """Build a boolean mask selecting bboxes whose area lies in a range.

    A bbox is selected when ``min_area <= area < max_area``; a bound that is
    None is not applied.

    Args:
        bboxes (numpy.ndarray): The bboxes with shape (n, 4) in 'xyxy' format.
        min_area (Optional[float]): Inclusive lower bound of the area. If
            None, no lower bound is applied.
        max_area (Optional[float]): Exclusive upper bound of the area. If
            None, no upper bound is applied.
        use_legacy_coordinate (bool): Whether to use coordinate system in
            mmdet v1.x. which means width, height should be
            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
            Note when function is used in `VOCDataset`, it should be
            True to align with the official implementation
            `http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar`
            Defaults to False.

    Returns:
        numpy.ndarray: A boolean mask over ``bboxes``; True marks a bbox whose
        area is inside the requested range.
    """
    areas = calculate_bboxes_area(bboxes, use_legacy_coordinate)
    keep = np.ones_like(areas, dtype=bool)
    # Each (bound, comparison) pair is applied only when the bound is given.
    for bound, within in ((min_area, np.greater_equal), (max_area, np.less)):
        if bound is not None:
            keep &= within(areas, bound)
    return keep


class VOCMeanAP(BaseMetric):
    """Pascal VOC evaluation metric.

    This metric computes the VOC mAP (mean Average Precision) with the given IoU
    thresholds and scale ranges.

    Args:
        iou_thrs (float | List[float]): IoU thresholds. Defaults to 0.5.
        scale_ranges (List[tuple], optional): Scale ranges for evaluating
            mAP. If not specified, all bounding boxes would be included in
            evaluation. The all-scale range ``(None, None)`` is always
            evaluated in addition to the given ranges; the passed list is
            not modified. Defaults to None.
        num_classes (int, optional): The number of classes. If None, it will be
            obtained from the 'classes' field in ``self.dataset_meta``.
            Defaults to None.
        eval_mode (str): 'area' or '11points', 'area' means calculating the
            area under precision-recall curve, '11points' means calculating
            the average precision of recalls at [0, 0.1, ..., 1].
            The PASCAL VOC2007 defaults to use '11points', while PASCAL
            VOC2012 defaults to use 'area'.
            Defaults to 'area'.
        use_legacy_coordinate (bool): Whether to use coordinate
            system in mmdet v1.x. which means width, height should be
            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
            Defaults to False.
        nproc (int): Processes used for computing TP and FP. If nproc
            is less than or equal to 1, multiprocessing will not be used.
            Defaults to 4.
        drop_class_ap (bool): Whether to drop the class without ground truth
            when calculating the average precision for each class.
            Defaults to True.
        classwise (bool): Whether to return the computed results of each
            class. Defaults to False.
        **kwargs: Keyword parameters passed to :class:`BaseMetric`.

    Examples:
        >>> import numpy as np
        >>> from mmeval import VOCMeanAP
        >>> num_classes = 4
        >>> voc_map = VOCMeanAP(num_classes=4)
        >>>
        >>> def _gen_bboxes(num_bboxes, img_w=256, img_h=256):
        ...     # randomly generate bounding boxes in 'xyxy' format.
        ...     x = np.random.rand(num_bboxes, ) * img_w
        ...     y = np.random.rand(num_bboxes, ) * img_h
        ...     w = np.random.rand(num_bboxes, ) * (img_w - x)
        ...     h = np.random.rand(num_bboxes, ) * (img_h - y)
        ...     return np.stack([x, y, x + w, y + h], axis=1)
        >>>
        >>> prediction = {
        ...     'bboxes': _gen_bboxes(10),
        ...     'scores': np.random.rand(10, ),
        ...     'labels': np.random.randint(0, num_classes, size=(10, ))
        ... }
        >>> groundtruth = {
        ...     'bboxes': _gen_bboxes(10),
        ...     'labels': np.random.randint(0, num_classes, size=(10, )),
        ...     'bboxes_ignore': _gen_bboxes(5),
        ...     'labels_ignore': np.random.randint(0, num_classes, size=(5, ))
        ... }
        >>> voc_map(predictions=[prediction, ], groundtruths=[groundtruth, ])  # doctest: +ELLIPSIS  # noqa: E501
        {'AP50': ..., 'mAP': ...}
    """

    def __init__(self,
                 iou_thrs: Union[float, List[float]] = 0.5,
                 scale_ranges: Optional[List[Tuple]] = None,
                 num_classes: Optional[int] = None,
                 eval_mode: str = 'area',
                 use_legacy_coordinate: bool = False,
                 nproc: int = 4,
                 drop_class_ap: bool = True,
                 classwise: bool = False,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        if isinstance(iou_thrs, float):
            iou_thrs = [iou_thrs]
        assert is_list_of(iou_thrs, float), \
            '`iou_thrs` should be float or a list of float'

        self.iou_thrs = iou_thrs

        if scale_ranges is None:
            scale_ranges = [(None, None)]
        elif (None, None) not in scale_ranges:
            # We always compute the mAP across all scales. Build a new list
            # instead of appending in place so that the list object passed
            # in by the caller is left untouched.
            scale_ranges = [*scale_ranges, (None, None)]
        self.scale_ranges = scale_ranges

        # Scales are side lengths; the filtering works on areas, so square
        # every bound that is given and keep None (unbounded) as is.
        self._area_ranges = [
            (None if min_scale is None else min_scale**2,
             None if max_scale is None else max_scale**2)
            for min_scale, max_scale in self.scale_ranges
        ]

        self._num_classes = num_classes

        assert eval_mode in ['area', '11points'], \
            'Unrecognized mode, only "area" and "11points" are supported'
        self.eval_mode = eval_mode

        self.nproc = nproc
        self.use_legacy_coordinate = use_legacy_coordinate
        self.drop_class_ap = drop_class_ap
        self.classwise = classwise

        self.num_iou = len(self.iou_thrs)
        self.num_scale = len(self.scale_ranges)

    @property
    def num_classes(self) -> int:
        """Returns the number of classes.

        The number of classes should be set during initialization, otherwise it
        will be obtained from the 'classes' field in ``self.dataset_meta``
        (falling back to the deprecated 'CLASSES' field). The value found in
        ``dataset_meta`` is cached for subsequent accesses.

        Returns:
            int: The number of classes.

        Raises:
            RuntimeError: If the num_classes is not set.
        """
        if self._num_classes is not None:
            return self._num_classes
        if self.dataset_meta and 'classes' in self.dataset_meta:
            self._num_classes = len(self.dataset_meta['classes'])
        elif self.dataset_meta and 'CLASSES' in self.dataset_meta:
            self._num_classes = len(self.dataset_meta['CLASSES'])
            warnings.warn(
                'The `CLASSES` in `dataset_meta` is deprecated, '
                'use `classes` instead!', DeprecationWarning)
        else:
            raise RuntimeError(
                "The `num_claases` is required, and also not found 'classes' "
                f'in dataset_meta: {self.dataset_meta}')
        return self._num_classes

    def add(self, predictions: Sequence[Dict], groundtruths: Sequence[Dict]) -> None:  # type: ignore # yapf: disable # noqa: E501
        """Add the intermediate results to ``self._results``.

        Predictions and ground truths are paired by position; each pair is
        stored as one ``(prediction, groundtruth)`` tuple.

        Args:
            predictions (Sequence[dict]): A sequence of dict. Each dict
                representing a detection result for an image, with the
                following keys:

                - bboxes (numpy.ndarray): Shape (N, 4), the predicted
                  bounding bboxes of this image, in 'xyxy' format.
                - scores (numpy.ndarray): Shape (N, ), the predicted scores
                  of bounding boxes.
                - labels (numpy.ndarray): Shape (N, ), the predicted labels
                  of bounding boxes.

            groundtruths (Sequence[dict]): A sequence of dict. Each dict
                represents a groundtruths for an image, with the following
                keys:

                - bboxes (numpy.ndarray): Shape (M, 4), the ground truth
                  bounding bboxes of this image, in 'xyxy' format.
                - labels (numpy.ndarray): Shape (M, ), the ground truth
                  labels of bounding boxes.
                - bboxes_ignore (numpy.ndarray): Shape (K, 4), the ground
                  truth ignored bounding bboxes of this image,
                  in 'xyxy' format.
                - labels_ignore (numpy.ndarray): Shape (K, ), the ground
                  truth ignored labels of bounding boxes.
        """
        for prediction, groundtruth in zip(predictions, groundtruths):
            assert isinstance(prediction, dict), 'The prediciton should be ' \
                f'a sequence of dict, but got a sequence of {type(prediction)}.'  # noqa: E501
            assert isinstance(groundtruth, dict), 'The label should be ' \
                f'a sequence of dict, but got a sequence of {type(groundtruth)}.'  # noqa: E501
            self._results.append((prediction, groundtruth))

    @staticmethod
    def _calculate_image_tpfp(
            pred_bboxes: np.ndarray, gt_bboxes: np.ndarray,
            ignore_gt_bboxes: np.ndarray, iou_thrs: List[float],
            area_ranges: List[Tuple[Optional[float], Optional[float]]],
            use_legacy_coordinate: bool) -> Tuple[np.ndarray, np.ndarray]:
        """Calculate the true positive and false positive on an image.

        Args:
            pred_bboxes (numpy.ndarray): Predicted bboxes of this image, with
                shape (N, 5). The predicted score of each bbox is
                concatenated behind the bbox coordinates.
            gt_bboxes (numpy.ndarray): Ground truth bboxes of this image, with
                shape (M, 4).
            ignore_gt_bboxes (numpy.ndarray): Ground truth ignored bboxes of
                this image, with shape (K, 4).
            iou_thrs (List[float]): The IoU thresholds.
            area_ranges (List[Tuple]): The area ranges.
            use_legacy_coordinate (bool): Refer to :class:`VOCMeanAP`. It is
                applied both to the IoU computation and to the bbox areas
                used for the area-range filtering.

        Returns:
            tuple (tp, fp):

            - tp (numpy.ndarray): Shape (num_ious, num_scales, N),
              the true positive flag of each predicted bbox on this image.
            - fp (numpy.ndarray): Shape (num_ious, num_scales, N),
              the false positive flag of each predicted bbox on this image.

        Note:
            This method should be a staticmethod to avoid resource competition
            during multiple processes.
        """
        # Step 1. Concatenate `gt_bboxes` and `ignore_gt_bboxes`, and flag
        # which rows of the concatenation are ignored ground truths.
        all_gt_bboxes = np.concatenate((gt_bboxes, ignore_gt_bboxes))
        num_all_gts = all_gt_bboxes.shape[0]
        ignore_gt_flags = np.concatenate(
            (np.zeros(gt_bboxes.shape[0], dtype=bool),
             np.ones(ignore_gt_bboxes.shape[0], dtype=bool)))

        # Step 2. Initialize the `tp` and `fp` arrays.
        num_preds = pred_bboxes.shape[0]
        tp = np.zeros((len(iou_thrs), len(area_ranges), num_preds))
        fp = np.zeros((len(iou_thrs), len(area_ranges), num_preds))

        # For every area range, which predicted bboxes fall inside it. This
        # does not depend on the IoU threshold, so compute it only once.
        pred_area_masks = [
            filter_by_bboxes_area(pred_bboxes[:, :4], min_area, max_area,
                                  use_legacy_coordinate)
            for min_area, max_area in area_ranges
        ]

        # Step 3. If there are no gt bboxes in this image, then all pred bboxes
        # within area range are false positives.
        if num_all_gts == 0:
            for area_idx, pred_in_range in enumerate(pred_area_masks):
                fp[:, area_idx, pred_in_range] = 1
            return tp, fp

        # Step 4. Calculate the IoUs between the predicted bboxes and the
        # ground truth bboxes.
        ious = calculate_overlaps(
            pred_bboxes[:, :4],
            all_gt_bboxes,
            mode='iou',
            use_legacy_coordinate=use_legacy_coordinate)
        # For each pred bbox, the max iou with all gts.
        ious_max = ious.max(axis=1)
        # For each pred bbox, which gt overlaps most with it.
        ious_argmax = ious.argmax(axis=1)
        # Sort all pred bbox in descending order by scores.
        sorted_indices = np.argsort(-pred_bboxes[:, -1])

        # Step 5. Count the `tp` and `fp` of each iou threshold and area range.
        for area_idx, (min_area, max_area) in enumerate(area_ranges):
            pred_in_range = pred_area_masks[area_idx]
            # The flags of gt bboxes that are out of this area range.
            gt_out_of_range = ~filter_by_bboxes_area(
                all_gt_bboxes, min_area, max_area, use_legacy_coordinate)

            for iou_thr_idx, iou_thr in enumerate(iou_thrs):
                # The flags that gt bboxes have been matched.
                gt_covered_flags = np.zeros(num_all_gts, dtype=bool)

                # Count the prediction bboxes in order of decreasing score.
                for pred_bbox_idx in sorted_indices:
                    if ious_max[pred_bbox_idx] < iou_thr:
                        # Unmatched prediction: a false positive, but only
                        # when the prediction itself is inside the range.
                        if pred_in_range[pred_bbox_idx]:
                            fp[iou_thr_idx, area_idx, pred_bbox_idx] = 1
                        continue

                    matched_gt_idx = ious_argmax[pred_bbox_idx]
                    # Ignore the pred bbox that matches an ignored gt bbox.
                    if ignore_gt_flags[matched_gt_idx]:
                        continue
                    # Ignore the pred bbox whose matched gt bbox is out of
                    # the area range.
                    if gt_out_of_range[matched_gt_idx]:
                        continue
                    if gt_covered_flags[matched_gt_idx]:
                        # This gt bbox has already been matched by a
                        # higher-scored prediction, so count this one as fp.
                        fp[iou_thr_idx, area_idx, pred_bbox_idx] = 1
                    else:
                        tp[iou_thr_idx, area_idx, pred_bbox_idx] = 1
                        gt_covered_flags[matched_gt_idx] = True

        return tp, fp

    def get_class_predictions(self, predictions: List[dict],
                              class_index: int) -> List:
        """Get prediction results of a certain class index.

        Args:
            predictions (list[dict]): Same as :class:`VOCMeanAP.add`.
            class_index (int): Index of a specific class.

        Returns:
            list[np.ndarray]: A list of predicted bboxes of this class, one
            array per image. Each predicted score of the bbox is concatenated
            behind the predicted bbox.
        """
        class_preds = []
        for pred in predictions:
            pred_indices = (pred['labels'] == class_index)
            pred_bboxes_info = np.concatenate(
                (pred['bboxes'][pred_indices, :],
                 pred['scores'][pred_indices].reshape((-1, 1))),
                axis=1)
            class_preds.append(pred_bboxes_info)
        return class_preds

    def get_class_gts(self, groundtruths: List[dict],
                      class_index: int) -> Tuple:
        """Get ground truth information of a certain class index.

        Args:
            groundtruths (list[dict]): Same as :class:`VOCMeanAP.add`.
            class_index (int): Index of a specific class.

        Returns:
            tuple (class_gts, class_ignore_gts):

            - class_gts (List[numpy.ndarray]): The gt bboxes of this class,
              one array per image.
            - class_ignore_gts (List[numpy.ndarray]): The ignored gt bboxes of
              this class, one array per image. This is necessary when
              counting tp and fp.
        """
        class_gts = []
        class_ignore_gts = []

        for gt in groundtruths:
            gt_indices = (gt['labels'] == class_index)
            gt_bboxes = gt['bboxes'][gt_indices, :]
            ignore_gt_indices = (gt['labels_ignore'] == class_index)
            ignore_gt_bboxes = gt['bboxes_ignore'][ignore_gt_indices, :]
            class_gts.append(gt_bboxes)
            class_ignore_gts.append(ignore_gt_bboxes)

        return class_gts, class_ignore_gts

    def calculate_class_tpfp(self, predictions: List[dict],
                             groundtruths: List[dict], class_index: int,
                             pool: Optional[Pool]) -> Tuple:
        """Calculate the tp and fp of the given class index.

        Args:
            predictions (List[dict]): A list of dict. Each dict is the
                detection result of an image. Same as :class:`VOCMeanAP.add`.
            groundtruths (List[dict]): A list of dict. Each dict is the
                ground truth of an image. Same as :class:`VOCMeanAP.add`.
            class_index (int): The class index.
            pool (Optional[Pool]): A instance of :class:`multiprocessing.Pool`.
                If None, do not use multiprocessing.

        Returns:
            tuple (tp, fp, num_gts):

            - tp (numpy.ndarray): Shape (num_ious, num_scales, num_pred),
              the true positive flag of each predicted bbox for this class,
              ordered by decreasing prediction score.
            - fp (numpy.ndarray): Shape (num_ious, num_scales, num_pred),
              the false positive flag of each predicted bbox for this class,
              ordered by decreasing prediction score.
            - num_gts (numpy.ndarray): Shape (num_ious, num_scales), the
              number of (non-ignored) ground truths in each area range.
        """
        class_preds = self.get_class_predictions(predictions, class_index)
        class_gts, class_ignore_gts = self.get_class_gts(
            groundtruths, class_index)

        if pool is not None:
            num_images = len(class_preds)
            tpfp_list = pool.starmap(
                self._calculate_image_tpfp,
                zip(class_preds, class_gts, class_ignore_gts,
                    [self.iou_thrs] * num_images,
                    [self._area_ranges] * num_images,
                    [self.use_legacy_coordinate] * num_images))
        else:
            tpfp_list = [
                self._calculate_image_tpfp(image_preds, image_gts,
                                           image_ignore_gts, self.iou_thrs,
                                           self._area_ranges,
                                           self.use_legacy_coordinate)
                for image_preds, image_gts, image_ignore_gts in zip(
                    class_preds, class_gts, class_ignore_gts)
            ]

        # Merge the per-image flags along the prediction axis and reorder
        # them by decreasing score over the whole dataset.
        image_tp_list, image_fp_list = tuple(zip(*tpfp_list))
        sorted_indices = np.argsort(-np.vstack(class_preds)[:, -1])
        tp = np.concatenate(image_tp_list, axis=2)[..., sorted_indices]
        fp = np.concatenate(image_fp_list, axis=2)[..., sorted_indices]

        # Stack the gt bboxes once; the stacked array is the same for every
        # area range.
        all_class_gts = np.vstack(class_gts)
        num_gts = np.zeros((self.num_iou, self.num_scale), dtype=int)
        for idx, (min_area, max_area) in enumerate(self._area_ranges):
            area_mask = self._filter_by_bboxes_area(all_class_gts, min_area,
                                                    max_area)
            num_gts[:, idx] = np.sum(area_mask)

        return tp, fp, num_gts

    def compute_metric(self, results: list) -> dict:
        """Compute the VOCMeanAP metric.

        Args:
            results (List[tuple]): A list of tuple. Each tuple is the
                prediction and ground truth of an image. This list has already
                been synced across all ranks.

        Returns:
            dict: The computed metric, with the following keys:

            - mAP, the averaged across all IoU thresholds and all class.
            - AP{IoU}, the mAP of the specified IoU threshold.
            - mAP@{scale_range}, the mAP of the specified scale range.
            - classwise_result, the evaluation results of each class.
              This would be returned if ``self.classwise`` is True.
        """
        predictions, groundtruths = zip(*results)

        # Never start more worker processes than there are images.
        nproc = min(self.nproc, len(predictions))
        pool = Pool(nproc) if nproc > 1 else None

        eps = np.finfo(np.float32).eps
        results_per_class = []
        try:
            for class_index in range(self.num_classes):
                # Calculate tp, fp and num_gts.
                tp, fp, num_gts = self.calculate_class_tpfp(
                    predictions, groundtruths, class_index, pool)

                # Calculate recalls and precisions. `eps` guards against a
                # division by zero when there is no detection / ground truth.
                tp_cumsum = np.cumsum(tp, axis=2)
                fp_cumsum = np.cumsum(fp, axis=2)
                precisions = tp_cumsum / np.maximum(
                    (tp_cumsum + fp_cumsum), eps)
                recalls = tp_cumsum / np.maximum(num_gts[..., np.newaxis],
                                                 eps)

                # Calculate average precision per `iou_thr` and `scale_range`.
                ap = np.zeros((self.num_iou, self.num_scale),
                              dtype=np.float32)
                for i in range(self.num_iou):
                    for j in range(self.num_scale):
                        ap[i, j] = calculate_average_precision(
                            recalls[i, j], precisions[i, j], self.eval_mode)

                results_per_class.append({
                    'num_gts': num_gts,
                    'num_dets': tp.shape[-1],
                    'recalls': recalls,
                    'precisions': precisions,
                    'ap': ap,
                })
        finally:
            # Always release the worker processes, even when the evaluation
            # of a class raised an exception.
            if pool is not None:
                pool.close()
                pool.join()

        eval_results = self._aggregate_results(results_per_class)
        if self.classwise:
            eval_results['classwise_result'] = results_per_class

        return eval_results

    def _collect_class_aps(self, results_per_class: List[dict], iou_idx: int,
                           scale_idx: int) -> list:
        """Collect the per-class AP of one IoU threshold and scale range.

        Classes without ground truth in this scale range are skipped when
        ``self.drop_class_ap`` is True.
        """
        return [
            res['ap'][iou_idx][scale_idx] for res in results_per_class
            if res['num_gts'][iou_idx][scale_idx] > 0
            or not self.drop_class_ap
        ]

    def _aggregate_results(self, results_per_class: List[dict]) -> dict:
        """Aggregate class-wise results and return a dictionary.

        Args:
            results_per_class (List[dict]): The class-wise evaluate results.

        Returns:
            dict: The aggregated metric results, with the following keys:

            - mAP, the averaged across all IoU thresholds and all class.
            - AP{IoU}, the mAP of the specified IoU threshold.
            - mAP@{scale_range}, the mAP of the specified scale range.
        """
        eval_results = {}

        # Mean AP over the kept classes for every (iou_thr, scale_range)
        # pair. Both groups of keys below are derived from this table.
        mean_aps = [[
            np.array(self._collect_class_aps(results_per_class, i,
                                             j)).mean().item()
            for j in range(len(self.scale_ranges))
        ] for i in range(len(self.iou_thrs))]

        # Calculate `AP{iou_thr}` (while scale_range is None) for each
        # `iou_thrs`.
        for i, iou_thr in enumerate(self.iou_thrs):
            for j, scale_range in enumerate(self.scale_ranges):
                if scale_range != (None, None):
                    continue
                eval_results[f'AP{round(iou_thr * 100)}'] = mean_aps[i][j]

        # Calculate `mAP@{scale_range}` and `mAP` (while scale_range is None)
        # overall `iou_thrs`.
        for j, scale_range in enumerate(self.scale_ranges):
            ap_per_ious = [
                mean_aps[i][j] for i in range(len(self.iou_thrs))
            ]
            if scale_range == (None, None):
                key = 'mAP'
            else:
                key = f'mAP@{scale_range}'
            eval_results[key] = np.array(ap_per_ious).mean().item()

        return eval_results

    def _filter_by_bboxes_area(self, bboxes: np.ndarray,
                               min_area: Optional[float],
                               max_area: Optional[float]):
        """Filter the bboxes with an area range.

        Thin wrapper around :func:`filter_by_bboxes_area` that forwards
        ``self.use_legacy_coordinate``.

        Args:
            bboxes (numpy.ndarray): The bboxes with shape (n, 4) in
                'xyxy' format.
            min_area (Optional[float]): The minimum area. If None, does
                not filter the minimum area.
            max_area (Optional[float]): The maximum area. If None, does
                not filter the maximum area.

        Returns:
            numpy.ndarray: A mask of ``bboxes`` identify which bbox
            are filtered.
        """
        return filter_by_bboxes_area(bboxes, min_area, max_area,
                                     self.use_legacy_coordinate)