import numpy as np


def group_gt_boxes_by_image_name(gt_boxes):
    """Group ground truth boxes by the image they belong to.

    Parameters:
    -----------
    gt_boxes: iterable
        Each element is a sequence whose first item is the image name and
        whose remaining items are the box coordinates.

    Returns:
    --------
    gt_dict: dict
        Maps each image name to a list of ``{'bbox': coordinates}`` dicts,
        in the order the boxes were given.
    """
    gt_dict = {}
    for box in gt_boxes:
        gt_dict.setdefault(box[0], []).append({'bbox': box[1:]})
    return gt_dict


def voc_ap(rec, prec, use_07_metric=False):
    """Compute VOC AP given precision and recall.

    ap = voc_ap(rec, prec, [use_07_metric])

    Parameters:
    -----------
    rec: 1d array-like
        Recall values, one per detection.
    prec: 1d array-like
        Precision values, one per detection (same length as ``rec``).
    [use_07_metric]: boolean
        If true, uses the VOC 07 11 point method (default: False).

    Returns:
    --------
    ap: float
        Average precision.
    """
    # Accept plain Python sequences as well as arrays.
    rec = np.asarray(rec, dtype=float)
    prec = np.asarray(prec, dtype=float)

    if use_07_metric:
        # 11 point metric: average the best precision reachable at each
        # recall threshold 0.0, 0.1, ..., 1.0
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            reached = rec >= t
            p = np.max(prec[reached]) if reached.any() else 0.
            ap = ap + p / 11.
        return ap

    # correct AP calculation
    # first append sentinel values at both ends
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope: a running maximum taken from the
    # right makes precision non-increasing as recall grows
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    idx = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])


def _box_overlaps(bb, gt_boxes):
    """Return the IoU of box ``bb`` with every row of ``gt_boxes``.

    Boxes are [x_min, y_min, x_max, y_max]; widths and heights use the
    inclusive-pixel (+1) convention.
    """
    # intersection
    ixmin = np.maximum(gt_boxes[:, 0], bb[0])
    iymin = np.maximum(gt_boxes[:, 1], bb[1])
    ixmax = np.minimum(gt_boxes[:, 2], bb[2])
    iymax = np.minimum(gt_boxes[:, 3], bb[3])
    iw = np.maximum(ixmax - ixmin + 1., 0.)
    ih = np.maximum(iymax - iymin + 1., 0.)
    inters = iw * ih

    # union
    uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
           (gt_boxes[:, 2] - gt_boxes[:, 0] + 1.) *
           (gt_boxes[:, 3] - gt_boxes[:, 1] + 1.) - inters)
    return inters / uni


def objectDetectionAP(dets, gts, ovthresh=0.5, use_07_metric=False):
    """Compute precision, recall and average precision of detections.

    This function takes a list of ground truth boxes and a list of detected
    bounding boxes for a given class and computes the average precision of
    the detections with respect to the ground truth boxes.

    Parameters:
    -----------
    dets: list
        List of bounding box detections. Each box is represented as a list
        with format:
            Case 1 (confidence provided):
                ['image_name', 'x_min', 'y_min', 'x_max', 'y_max', 'confidence']
            Case 2 (confidence not provided):
                ['image_name', 'x_min', 'y_min', 'x_max', 'y_max']
        When confidences are provided the detections are processed in order
        of decreasing confidence; otherwise in the order given.
    gts: list
        List of ground truth boxes. Each box is represented as a list with
        the following format: [image_name, x_min, y_min, x_max, y_max].
    [ovthresh]: float
        Overlap threshold (default = 0.5). A detection matches a ground
        truth box only if their IoU is strictly greater than this value.
    [use_07_metric]: boolean
        Whether to use VOC07's 11 point AP computation (default False)

    Returns:
    --------
    rec: 1d array-like
        Array where each element (rec[i]) is the recall when considering
        i+1 detections
    prec: 1d array-like
        Array where each element (prec[i]) is the precision when considering
        i+1 detections
    ap: float
        Average precision between detected boxes and the ground truth boxes.
        (it is also the area under the precision-recall curve).

    Notes:
    ------
    * A detection on an image that has no ground truth boxes counts as a
      false positive.
    * If ``dets`` is empty, ``rec`` and ``prec`` are empty arrays and ``ap``
      is 0.
    * If there are no ground truth boxes at all, recall is reported as 0.
    * Each ground truth box can be matched by at most one detection; later
      detections hitting the same box count as false positives.

    Example:

    With confidence scores:
    >> predictions_list = [['img_00285.png',330,463,387,505,0.0739],
                           ['img_00285.png',420,433,451,498,0.0910],
                           ['img_00285.png',328,465,403,540,0.1008],
                           ['img_00285.png',480,477,508,522,0.1012],
                           ['img_00285.png',357,460,417,537,0.1058],
                           ['img_00285.png',356,456,391,521,0.0843],
                           ['img_00225.png',345,460,415,547,0.0539],
                           ['img_00225.png',381,362,455,513,0.0542],
                           ['img_00225.png',382,366,416,422,0.0559],
                           ['img_00225.png',730,463,763,583,0.0588]]
    >> ground_truth_list = [['img_00285.png',480,457,515,529],
                            ['img_00285.png',480,457,515,529],
                            ['img_00225.png',522,540,576,660],
                            ['img_00225.png',739,460,768,545]]
    >> rec, prec, ap = objectDetectionAP(predictions_list, ground_truth_list)
    >> print(ap)
    0.125

    Without confidence scores:
    >> predictions_list = [['img_00285.png',330,463,387,505],
                           ['img_00285.png',420,433,451,498],
                           ['img_00285.png',328,465,403,540],
                           ['img_00285.png',480,477,508,522],
                           ['img_00285.png',357,460,417,537],
                           ['img_00285.png',356,456,391,521],
                           ['img_00225.png',345,460,415,547],
                           ['img_00225.png',381,362,455,513],
                           ['img_00225.png',382,366,416,422],
                           ['img_00225.png',730,463,763,583]]
    >> ground_truth_list = [['img_00285.png',480,457,515,529],
                            ['img_00285.png',480,457,515,529],
                            ['img_00225.png',522,540,576,660],
                            ['img_00225.png',739,460,768,545]]
    >> rec, prec, ap = objectDetectionAP(predictions_list, ground_truth_list)
    >> print(ap)
    0.0625
    """
    # Load ground truth: per image, the boxes as a float array (converted
    # once here rather than per detection) and a "taken" flag per box.
    gt_dict = group_gt_boxes_by_image_name(gts)
    recs = {}
    npos = 0
    for imagename, entries in gt_dict.items():
        bbox = np.array([[float(v) for v in entry['bbox'][:4]]
                         for entry in entries])
        recs[imagename] = {'bbox': bbox, 'det': [False] * len(entries)}
        npos += len(entries)

    # Nothing detected: empty curves, zero AP.
    if len(dets) == 0:
        empty = np.zeros(0)
        return empty, empty.copy(), voc_ap(empty, empty, use_07_metric)

    # Load detections
    det_length = len(dets[0])

    # Check that all boxes are the same size
    assert all(len(det) == det_length for det in dets), \
        'Not all boxes have the same dimensions.'

    image_ids = [x[0] for x in dets]
    BB = np.array([[float(z) for z in x[1:5]] for x in dets])

    if det_length == 6:
        print('confidence scores are present')
        confidence = np.array([float(x[-1]) for x in dets])
        # sort by decreasing confidence
        sorted_ind = np.argsort(-confidence)
    else:
        print('confidence scores are not present')
        sorted_ind = np.arange(len(dets))

    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d, image_id in enumerate(image_ids):
        R = recs.get(image_id)
        # No ground truth for this image: the detection cannot match.
        if R is None or R['bbox'].size == 0:
            fp[d] = 1.
            continue

        overlaps = _box_overlaps(BB[d], R['bbox'])
        jmax = int(np.argmax(overlaps))

        # A true positive needs sufficient overlap with a ground truth box
        # that no higher-ranked detection has already claimed.
        if overlaps[jmax] > ovthresh and not R['det'][jmax]:
            tp[d] = 1.
            R['det'][jmax] = True
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    # With no ground truth boxes tp is all zeros; clamping the denominator
    # yields a recall of 0 instead of a 0/0 NaN.
    rec = tp / float(max(npos, 1))
    # defensive guard against a zero denominator
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap