|
@@ -0,0 +1,270 @@
|
|
|
+import numpy as np
|
|
|
+import torch
|
|
|
+from torch import nn
|
|
|
+from torch.nn import functional as F
|
|
|
+from torchvision.ops import nms
|
|
|
+
|
|
|
+
|
|
|
class BBoxUtility(object):
    """Turn raw SSD head outputs into per-image detection arrays.

    The pipeline implemented here is: decode location offsets against the
    anchors, run per-class score thresholding and NMS, then map the
    surviving boxes from network-input coordinates to image pixels.
    """

    def __init__(self, num_classes):
        """Remember the class count.

        Args:
            num_classes: Exclusive upper bound of the class indices scanned
                by ``decode_box``. Index 0 is skipped there, so it is
                presumably the background class -- confirm against the
                model head.
        """
        self.num_classes = num_classes

    def ssd_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
        """Map box centres/sizes to corner boxes scaled by ``image_shape``.

        Args:
            box_xy: Array whose last axis holds the box centre as (x, y).
            box_wh: Array whose last axis holds the box size as (w, h).
            input_shape: Two-element network input shape; assumed to be
                (height, width) since it is combined with the (y, x)
                ordered boxes -- confirm against the caller.
            image_shape: Two-element original image shape, same ordering
                as ``input_shape``.
            letterbox_image: When truthy, undo the centred padding and
                scaling of a letterbox resize before scaling the boxes.

        Returns:
            Array whose last axis is (y_min, x_min, y_max, x_max),
            multiplied by ``image_shape``.
        """
        # Reverse the last axis so coordinates line up with the shape arrays.
        # Both results are views onto the caller's arrays.
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = np.array(input_shape)
        image_shape = np.array(image_shape)

        if letterbox_image:
            # Size of the image content inside the letterboxed input, and
            # the (relative) padding on the leading side of each axis.
            new_shape = np.round(image_shape * np.min(input_shape / image_shape))
            offset = (input_shape - new_shape) / 2. / input_shape
            scale = input_shape / new_shape

            box_yx = (box_yx - offset) * scale
            # Out-of-place on purpose: ``box_hw`` is a view of the caller's
            # ``box_wh``; an in-place multiply would mutate the argument and
            # would fail outright for integer input arrays.
            box_hw = box_hw * scale

        half_hw = box_hw / 2.
        box_mins = box_yx - half_hw
        box_maxes = box_yx + half_hw
        boxes = np.concatenate(
            [box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]],
            axis=-1,
        )
        # ``boxes`` is a freshly allocated array, so scaling in place is safe
        # and keeps the dtype produced above.
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes

    def decode_boxes(self, mbox_loc, anchors, variances):
        """Decode location offsets relative to anchors into corner boxes.

        Args:
            mbox_loc: 2-D tensor; columns 0-3 are the centre-x, centre-y,
                width and height offsets.
            anchors: 2-D array-like whose columns 0-3 are read as
                (x_min, y_min, x_max, y_max). Presumably a tensor on the
                same device as ``mbox_loc`` -- confirm against the caller.
            variances: Indexable pair; element 0 scales the centre
                offsets, element 1 scales the size offsets.

        Returns:
            Tensor with columns (x_min, y_min, x_max, y_max), clamped to
            the range [0, 1].
        """
        anchor_width = anchors[:, 2] - anchors[:, 0]
        anchor_height = anchors[:, 3] - anchors[:, 1]
        anchor_center_x = 0.5 * (anchors[:, 2] + anchors[:, 0])
        anchor_center_y = 0.5 * (anchors[:, 3] + anchors[:, 1])

        # Centres: offset expressed in anchor sizes, shifted by the anchor centre.
        center_x = mbox_loc[:, 0] * anchor_width * variances[0] + anchor_center_x
        center_y = mbox_loc[:, 1] * anchor_height * variances[0] + anchor_center_y

        # Sizes: exponential of the offset, scaled by the anchor size.
        width = torch.exp(mbox_loc[:, 2] * variances[1]) * anchor_width
        height = torch.exp(mbox_loc[:, 3] * variances[1]) * anchor_height

        decode_bbox = torch.stack(
            (
                center_x - 0.5 * width,
                center_y - 0.5 * height,
                center_x + 0.5 * width,
                center_y + 0.5 * height,
            ),
            dim=-1,
        )
        return torch.clamp(decode_bbox, min=0.0, max=1.0)

    def decode_box(self, predictions, anchors, image_shape, input_shape, letterbox_image, variances=(0.1, 0.2),
                   nms_iou=0.3, confidence=0.5):
        """Run the full decode -> threshold -> NMS -> rescale pipeline.

        Args:
            predictions: Indexable; element 0 is a numpy array of location
                offsets and element 1 a numpy array of class logits, both
                indexed by batch on the first axis.
            anchors: Anchors forwarded to ``decode_boxes``.
            image_shape: Original image shape forwarded to
                ``ssd_correct_boxes``.
            input_shape: Network input shape forwarded to
                ``ssd_correct_boxes``.
            letterbox_image: Whether the input was letterboxed.
            variances: Pair forwarded to ``decode_boxes``. The default is a
                tuple so it cannot be mutated between calls.
            nms_iou: IoU threshold passed to ``nms``.
            confidence: Scores must be strictly greater than this to be kept.

        Returns:
            A list with one entry per batch element. An entry is an empty
            list when nothing passed the threshold; otherwise it is an
            array whose columns are the four corrected box coordinates
            (in the order produced by ``ssd_correct_boxes``), the label
            (class index minus one) and the score.
        """
        mbox_loc = torch.from_numpy(predictions[0])
        mbox_conf = F.softmax(torch.from_numpy(predictions[1]), dim=-1)

        results = []
        for image_loc, image_conf in zip(mbox_loc, mbox_conf):
            decode_bbox = self.decode_boxes(image_loc, anchors, variances)

            detections = []
            # Class index 0 is skipped on purpose (see ``__init__``).
            for c in range(1, self.num_classes):
                c_confs = image_conf[:, c]
                c_confs_m = c_confs > confidence
                if not c_confs_m.any():
                    continue

                boxes_to_process = decode_bbox[c_confs_m]
                confs_to_process = c_confs[c_confs_m]
                keep = nms(boxes_to_process, confs_to_process, nms_iou)

                good_boxes = boxes_to_process[keep]
                confs = confs_to_process[keep][:, None]
                # Follow the score tensor's device and dtype instead of
                # hard-coding a CUDA / CPU branch.
                labels = torch.full_like(confs, c - 1)

                c_pred = torch.cat((good_boxes, labels, confs), dim=1).cpu().numpy()
                detections.extend(c_pred)

            if detections:
                detections = np.array(detections)
                box_xy = (detections[:, 0:2] + detections[:, 2:4]) / 2
                box_wh = detections[:, 2:4] - detections[:, 0:2]
                detections[:, :4] = self.ssd_correct_boxes(
                    box_xy, box_wh, input_shape, image_shape, letterbox_image)
            results.append(detections)

        return results
|
|
|
+
|
|
|
+
|
|
|
def loc2bbox(src_bbox, loc):
    """Apply regression offsets to source boxes.

    Args:
        src_bbox: 2-D tensor whose columns 0-3 are read as
            (x_min, y_min, x_max, y_max).
        loc: 2-D tensor of offsets. Columns are read in strides of four as
            (dx, dy, dw, dh), so several offset groups per row are decoded
            at once.

    Returns:
        Tensor shaped like ``loc`` holding (x_min, y_min, x_max, y_max) for
        each offset group. When ``src_bbox`` has no rows, an empty (0, 4)
        tensor with the dtype and device of ``loc``.
    """
    if src_bbox.size()[0] == 0:
        # ``new_zeros`` inherits both dtype and device from ``loc``, so the
        # empty result lives where the non-empty result would.
        return loc.new_zeros((0, 4))

    # Keep a trailing axis so the per-box values broadcast across all
    # offset groups in a row.
    src_width = torch.unsqueeze(src_bbox[:, 2] - src_bbox[:, 0], -1)
    src_height = torch.unsqueeze(src_bbox[:, 3] - src_bbox[:, 1], -1)
    src_ctr_x = torch.unsqueeze(src_bbox[:, 0], -1) + 0.5 * src_width
    src_ctr_y = torch.unsqueeze(src_bbox[:, 1], -1) + 0.5 * src_height

    dx, dy, dw, dh = (loc[:, k::4] for k in range(4))

    # Centre offsets are expressed in units of the source box size; size
    # offsets are log-scale factors.
    ctr_x = dx * src_width + src_ctr_x
    ctr_y = dy * src_height + src_ctr_y
    half_w = 0.5 * (torch.exp(dw) * src_width)
    half_h = 0.5 * (torch.exp(dh) * src_height)

    dst_bbox = torch.zeros_like(loc)
    dst_bbox[:, 0::4] = ctr_x - half_w
    dst_bbox[:, 1::4] = ctr_y - half_h
    dst_bbox[:, 2::4] = ctr_x + half_w
    dst_bbox[:, 3::4] = ctr_y + half_h
    return dst_bbox
|
|
|
+
|
|
|
+
|
|
|
class DecodeBox():
    """Decode Faster R-CNN head outputs into per-image detection arrays."""

    def __init__(self, num_classes):
        """Build the offset scaling vector and store the class count.

        Args:
            num_classes: Number of classes; one is added internally, and
                ``forward`` skips index 0, so the extra slot is presumably
                the background class -- confirm against the model head.
        """
        # (0.1, 0.1, 0.2, 0.2) repeated once per class slot, with a leading
        # axis of size 1 so it broadcasts over the ROI axis.
        self.std = torch.Tensor([0.1, 0.1, 0.2, 0.2]).repeat(num_classes + 1)[None]
        self.num_classes = num_classes + 1

    def frcnn_correct_boxes(self, box_xy, box_wh, input_shape, image_shape):
        """Convert box centres/sizes to corner boxes scaled by ``image_shape``.

        Args:
            box_xy: Array whose last axis holds the box centre as (x, y).
            box_wh: Array whose last axis holds the box size as (w, h).
            input_shape: Not used by the computation; kept so the signature
                stays compatible with existing callers.
            image_shape: Two-element original image shape; assumed to be
                (height, width) since it multiplies (y, x) ordered boxes --
                confirm against the caller.

        Returns:
            Array whose last axis is (y_min, x_min, y_max, x_max),
            multiplied by ``image_shape``.
        """
        # Reverse the last axis so coordinates line up with ``image_shape``.
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        image_shape = np.array(image_shape)

        half_hw = box_hw / 2.
        box_mins = box_yx - half_hw
        box_maxes = box_yx + half_hw
        boxes = np.concatenate(
            [box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]],
            axis=-1,
        )
        # ``boxes`` is freshly allocated, so scaling in place is safe.
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes

    def forward(self, roi_cls_locs, roi_scores, rois, image_shape, input_shape, nms_iou=0.3, confidence=0.5):
        """Decode, threshold, NMS and rescale the detections of a batch.

        Args:
            roi_cls_locs: Numpy array of per-class box offsets, indexed by
                batch on the first axis.
            roi_scores: Numpy array of class logits, indexed by batch on
                the first axis.
            rois: Numpy array of proposal boxes; reshaped to
                (batch, -1, 4) here.
            image_shape: Original image shape forwarded to
                ``frcnn_correct_boxes``.
            input_shape: Network input shape; element 1 divides the x
                coordinates and element 0 the y coordinates.
            nms_iou: IoU threshold passed to ``nms``.
            confidence: Scores must be strictly greater than this to be kept.

        Returns:
            A list with one entry per batch element. An entry is an empty
            list when nothing passed the threshold; otherwise it is an
            array whose columns are the four corrected box coordinates
            (in the order produced by ``frcnn_correct_boxes``), the score
            and the label (class index minus one).
            NOTE: score comes before label here, which is the opposite of
            ``BBoxUtility.decode_box``; the order is kept for callers.
        """
        roi_cls_locs = torch.from_numpy(roi_cls_locs)
        roi_scores = torch.from_numpy(roi_scores)
        rois = torch.from_numpy(rois)

        bs = len(roi_cls_locs)
        rois = rois.view((bs, -1, 4))

        results = []
        for image_locs, image_scores, image_rois in zip(roi_cls_locs, roi_scores, rois):
            # Undo the offset normalisation, then split per class.
            roi_cls_loc = (image_locs * self.std).view([-1, self.num_classes, 4])

            # Pair every proposal with each of its per-class offsets.
            roi = image_rois.view((-1, 1, 4)).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(roi.contiguous().view((-1, 4)), roi_cls_loc.contiguous().view((-1, 4)))
            cls_bbox = cls_bbox.view([-1, self.num_classes, 4])

            # Normalise by the network input size (x by width, y by height).
            cls_bbox[..., [0, 2]] = cls_bbox[..., [0, 2]] / input_shape[1]
            cls_bbox[..., [1, 3]] = cls_bbox[..., [1, 3]] / input_shape[0]

            prob = F.softmax(image_scores, dim=-1)

            detections = []
            # Class index 0 is skipped on purpose (see ``__init__``).
            for c in range(1, self.num_classes):
                c_confs = prob[:, c]
                c_confs_m = c_confs > confidence
                if not c_confs_m.any():
                    continue

                boxes_to_process = cls_bbox[c_confs_m, c]
                confs_to_process = c_confs[c_confs_m]
                keep = nms(boxes_to_process, confs_to_process, nms_iou)

                good_boxes = boxes_to_process[keep]
                confs = confs_to_process[keep][:, None]
                # Follow the score tensor's device and dtype instead of
                # hard-coding a CUDA / CPU branch.
                labels = torch.full_like(confs, c - 1)

                c_pred = torch.cat((good_boxes, confs, labels), dim=1).cpu().numpy()
                detections.extend(c_pred)

            if detections:
                detections = np.array(detections)
                box_xy = (detections[:, 0:2] + detections[:, 2:4]) / 2
                box_wh = detections[:, 2:4] - detections[:, 0:2]
                detections[:, :4] = self.frcnn_correct_boxes(box_xy, box_wh, input_shape, image_shape)
            results.append(detections)

        return results
|