|
- # Copyright 2020 Tencent
- # SPDX-License-Identifier: BSD-3-Clause
-
- from math import sqrt
- import numpy as np
- import cv2
- import ncnn
- from .model_store import get_model_file
- from ..utils.functional import sigmoid, nms
-
-
class Yolact:
    """YOLACT instance-segmentation model running on an ncnn network.

    Calling an instance with a BGR image (an array whose first two
    dimensions are height and width) returns ``(boxes, masks, classes,
    scores)``:

    - ``boxes``: ``[num_dets, 4]`` rows of ``[x, y, w, h]`` in image pixels,
      where ``(x, y)`` is the top-left corner.
    - ``masks``: boolean array of shape ``[num_dets, img_h, img_w]``.
    - ``classes``: indices into ``class_names`` (never 0, the background).
    - ``scores``: confidence of each detection.
    """

    def __init__(
        self,
        target_size=550,
        confidence_threshold=0.05,
        nms_threshold=0.5,
        keep_top_k=200,
        num_threads=1,
        use_gpu=False,
    ):
        """Load the network and precompute the prior boxes.

        Args:
            target_size: side length the input image is resized to.
            confidence_threshold: detections scoring at or below this value
                are discarded.
            nms_threshold: IoU threshold handed to ``nms``.
            keep_top_k: maximum number of detections kept per class by
                ``nms`` and overall after all classes are merged.
            num_threads: number of threads ncnn may use.
            use_gpu: enable ncnn's Vulkan compute path.
        """
        self.target_size = target_size
        self.confidence_threshold = confidence_threshold
        self.nms_threshold = nms_threshold
        self.keep_top_k = keep_top_k
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Per-channel mean and 1/std applied to the RGB input.
        self.mean_vals = [123.68, 116.78, 103.94]
        self.norm_vals = [1.0 / 58.40, 1.0 / 57.12, 1.0 / 57.38]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original model converted from https://github.com/dbolya/yolact
        # yolact_resnet50_54_800000.pth
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolact.param"))
        self.net.load_model(get_model_file("yolact.bin"))

        # Prediction-grid sizes and anchor scale of every feature level,
        # plus the aspect ratios generated at each grid cell.
        self.conv_ws = [69, 35, 18, 9, 5]
        self.conv_hs = [69, 35, 18, 9, 5]
        self.aspect_ratios = [1, 0.5, 2]
        self.scales = [24, 48, 96, 192, 384]

        # Prior cache; see make_priors().
        self.priors = None
        self.last_img_size = None

        self.make_priors()

        # Label names indexed by class id; index 0 is the background class.
        self.class_names = [
            "background",
            "person",
            "bicycle",
            "car",
            "motorcycle",
            "airplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "couch",
            "potted plant",
            "bed",
            "dining table",
            "toilet",
            "tv",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush",
        ]

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Run detection and segmentation on one BGR image.

        Args:
            img: image array; ``img.shape[0]`` is the height and
                ``img.shape[1]`` the width.

        Returns:
            ``(boxes, masks, classes, scores)`` as described on the class.
            When nothing is detected every array has a leading dimension
            of 0.

        Raises:
            RuntimeError: if an output blob cannot be extracted.
        """
        img_h, img_w = img.shape[:2]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("input.1", mat_in)

        proto_data = self._extract(ex, "619")  # 138x138 x 32
        loc_data = self._extract(ex, "816")  # 4 x 19248
        mask_data = self._extract(ex, "818")  # maskdim 32 x 19248
        conf_data = self._extract(ex, "820")  # 81 x 19248
        prior_data = self.make_priors()

        boxes, masks, classes, scores = self.detect(
            conf_data, loc_data, prior_data, mask_data, img_w, img_h
        )

        if boxes.shape[0] == 0:
            # Nothing to segment: the matrix product and cv2.resize below
            # cannot handle zero detections.
            return boxes, np.zeros((0, img_h, img_w), dtype=bool), classes, scores

        # Combine the prototype masks with each detection's coefficients.
        masks = sigmoid(proto_data.transpose(1, 2, 0) @ masks.T)

        # Scale masks up to the full image
        masks = cv2.resize(masks, (img_w, img_h), interpolation=cv2.INTER_LINEAR)
        if masks.ndim == 2:
            # cv2.resize drops the channel axis when there is exactly one
            # detection; restore it so the transpose below is valid.
            masks = masks[:, :, np.newaxis]

        # transpose into the output shape [num_dets, img_h, img_w]
        masks = masks.transpose(2, 0, 1) > 0.5

        return boxes, masks, classes, scores

    def _extract(self, ex, blob_name):
        """Extract one output blob as a numpy array, raising on failure."""
        ret, mat = ex.extract(blob_name)
        if ret != 0:
            # ncnn reports extraction failure through a non-zero return code.
            raise RuntimeError(
                "ncnn failed to extract blob {!r} (return code {})".format(
                    blob_name, ret
                )
            )
        return np.array(mat)

    def make_priors(self):
        """Return the prior boxes, computing them only when needed.

        Priors are ``[x, y, width, height]`` rows where ``(x, y)`` is the
        box center, all relative to ``target_size``. Rows are ordered by
        feature level, then grid row, then grid column, then aspect ratio.
        The result is cached until ``target_size`` changes.
        """
        size_key = (self.target_size, self.target_size)
        if self.last_img_size != size_key:
            levels = []
            for conv_w, conv_h, scale in zip(self.conv_ws, self.conv_hs, self.scales):
                # +0.5 because priors are in center-size notation
                xs = (np.arange(conv_w) + 0.5) / conv_w
                ys = (np.arange(conv_h) + 0.5) / conv_h
                widths = np.array(
                    [scale * sqrt(ar) / self.target_size for ar in self.aspect_ratios],
                    dtype=np.float64,
                )

                # Axes are (row, column, aspect ratio), matching the order
                # the network emits its predictions in.
                cy, cx, w = np.meshgrid(ys, xs, widths, indexing="ij")

                # The height deliberately equals the width: this keeps
                # backward compatibility with the original model, which was
                # trained with square priors by accident.
                levels.append(np.stack((cx, cy, w, w), axis=-1).reshape(-1, 4))

            self.priors = np.concatenate(levels) if levels else np.empty((0, 4))
            self.last_img_size = size_key

        return self.priors

    def decode(self, loc, priors, img_w, img_h):
        """Decode predicted box offsets into pixel boxes.

        The offsets are applied to the priors with variances 0.1 and 0.2:

            center = prior_xy + loc_xy * 0.1 * prior_wh
            size   = prior_wh * exp(loc_wh * 0.2)

        The relative boxes are converted to top-left/size form, the corner
        is clamped to be >= 0 and the size to be <= 1, and the result is
        scaled to the image size (width and height get an extra +1).

        Args:
            loc: predicted offsets of shape ``[num_priors, 4]``.
            priors: prior boxes ``[cx, cy, w, h]`` of shape ``[num_priors, 4]``.
            img_w: width of the original image in pixels.
            img_h: height of the original image in pixels.

        Returns:
            Array of shape ``[num_priors, 4]`` holding ``[x, y, w, h]`` in
            image pixels, where ``(x, y)`` is the top-left corner.
        """
        variances = [0.1, 0.2]

        centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
        sizes = priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])
        boxes = np.concatenate((centers - sizes / 2, sizes), axis=1)

        # crop: the results must be written back, np.where/np.maximum do
        # not modify their input in place.
        boxes[:, :2] = np.maximum(boxes[:, :2], 0)
        boxes[:, 2:] = np.minimum(boxes[:, 2:], 1)

        # decode to img size
        boxes[:, 0] *= img_w
        boxes[:, 1] *= img_h
        boxes[:, 2] = boxes[:, 2] * img_w + 1
        boxes[:, 3] = boxes[:, 3] * img_h + 1

        return boxes

    def detect(self, conf_preds, loc_data, prior_data, mask_data, img_w, img_h):
        """Select the final detections from the raw network outputs.

        Every prior is assigned its best non-background class. Priors whose
        score does not exceed ``confidence_threshold`` are dropped, ``nms``
        is run per class, and at most ``keep_top_k`` detections are kept
        overall.

        Args:
            conf_preds: class scores ``[num_priors, num_classes]`` with the
                background class in column 0.
            loc_data: box offsets ``[num_priors, 4]``.
            prior_data: prior boxes ``[num_priors, 4]``.
            mask_data: mask coefficients ``[num_priors, mask_dim]``.
            img_w: width of the original image in pixels.
            img_h: height of the original image in pixels.

        Returns:
            ``(boxes, mask_coefficients, classes, scores)`` as numpy arrays
            that share their leading dimension (0 when nothing is detected).
            ``classes`` are indices into ``class_names``.
        """
        # Ignore the background column; class i here is label i + 1.
        cur_scores = conf_preds[:, 1:]
        num_class = cur_scores.shape[1]

        classes = np.argmax(cur_scores, axis=1)
        conf_scores = cur_scores[np.arange(cur_scores.shape[0]), classes]

        # filter by confidence_threshold
        keep = conf_scores > self.confidence_threshold
        conf_scores = conf_scores[keep]
        classes = classes[keep]
        masks = mask_data[keep, :]

        # decode x, y, w, h
        boxes = self.decode(loc_data[keep, :], prior_data[keep, :], img_w, img_h)

        # nms for every class that actually has candidates
        selected = []
        for i in range(num_class):
            candidates = np.flatnonzero(classes == i)
            if candidates.size == 0:
                continue

            picked = nms(
                boxes[candidates],
                conf_scores[candidates],
                iou_threshold=self.nms_threshold,
                top_k=self.keep_top_k,
            )
            # nms yields indices relative to this class's candidates; map
            # them back to rows of the filtered arrays.
            selected.append(candidates[np.asarray(list(picked), dtype=np.intp)])

        if selected:
            order = np.concatenate(selected)
        else:
            order = np.empty(0, dtype=np.intp)

        return self._select_top_k(
            boxes[order],
            masks[order],
            classes[order] + 1,  # shift back to class_names indexing
            conf_scores[order],
        )

    def _select_top_k(self, boxes, masks, classes, scores):
        """Keep the ``keep_top_k`` highest-scoring detections.

        When there are more than ``keep_top_k`` detections the survivors are
        returned in descending score order; otherwise the inputs are
        returned unchanged.
        """
        if scores.shape[0] > self.keep_top_k:
            # Negate for a descending sort; "stable" keeps ties in order.
            best = np.argsort(-scores, kind="stable")[: self.keep_top_k]
            return boxes[best], masks[best], classes[best], scores[best]
        return boxes, masks, classes, scores
|