# utils_bbox.py

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torchvision.ops import nms


class BBoxUtility(object):
    def __init__(self, num_classes):
        self.num_classes = num_classes

    def ssd_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
        #-----------------------------------------------------------------#
        #   Put the y axis first: it makes multiplying the boxes by the
        #   image height and width more convenient.
        #-----------------------------------------------------------------#
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = np.array(input_shape)
        image_shape = np.array(image_shape)

        if letterbox_image:
            #-----------------------------------------------------------------#
            #   offset is the shift of the valid image area relative to the
            #   top-left corner of the letterboxed input.
            #   new_shape is the image's height and width after scaling.
            #-----------------------------------------------------------------#
            new_shape = np.round(image_shape * np.min(input_shape / image_shape))
            offset    = (input_shape - new_shape) / 2. / input_shape
            scale     = input_shape / new_shape

            box_yx = (box_yx - offset) * scale
            box_hw *= scale

        box_mins  = box_yx - (box_hw / 2.)
        box_maxes = box_yx + (box_hw / 2.)
        boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes
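
    #------------------------------------------------------------------#
    #   Worked example of the letterbox correction (illustrative
    #   numbers): a 500x375 (w x h) image resized into a 300x300 input
    #   keeps the ratio min(300/375, 300/500) = 0.6, so the valid area
    #   is 300x225 with (300 - 225) / 2 = 37.5 px of padding above and
    #   below. That gives offset = (0.125, 0) and scale = (300/225, 1),
    #   which map the normalized box coordinates back onto the
    #   unpadded image.
    #------------------------------------------------------------------#
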
    def decode_boxes(self, mbox_loc, anchors, variances):
        # Width and height of the anchors
        anchor_width  = anchors[:, 2] - anchors[:, 0]
        anchor_height = anchors[:, 3] - anchors[:, 1]
        # Center points of the anchors
        anchor_center_x = 0.5 * (anchors[:, 2] + anchors[:, 0])
        anchor_center_y = 0.5 * (anchors[:, 3] + anchors[:, 1])

        # Offsets of the predicted box center from the anchor center
        decode_bbox_center_x = mbox_loc[:, 0] * anchor_width * variances[0]
        decode_bbox_center_x += anchor_center_x
        decode_bbox_center_y = mbox_loc[:, 1] * anchor_height * variances[0]
        decode_bbox_center_y += anchor_center_y

        # Width and height of the predicted box
        decode_bbox_width   = torch.exp(mbox_loc[:, 2] * variances[1])
        decode_bbox_width  *= anchor_width
        decode_bbox_height  = torch.exp(mbox_loc[:, 3] * variances[1])
        decode_bbox_height *= anchor_height

        # Top-left and bottom-right corners of the predicted box
        decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
        decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
        decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
        decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height

        # Stack the corners together
        decode_bbox = torch.cat((decode_bbox_xmin[:, None],
                                 decode_bbox_ymin[:, None],
                                 decode_bbox_xmax[:, None],
                                 decode_bbox_ymax[:, None]), dim=-1)
        # Clamp to the [0, 1] range
        decode_bbox = torch.min(torch.max(decode_bbox, torch.zeros_like(decode_bbox)), torch.ones_like(decode_bbox))
        return decode_bbox
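
    #------------------------------------------------------------------#
    #   In formula form, decode_boxes applies the standard SSD offset
    #   parameterization. For an anchor with center (acx, acy), size
    #   (aw, ah), predicted offsets (tx, ty, tw, th) and variances
    #   (v0, v1):
    #       cx = tx * aw * v0 + acx        cy = ty * ah * v0 + acy
    #       w  = aw * exp(tw * v1)         h  = ah * exp(th * v1)
    #   The box is then converted to corner form and clamped to [0, 1].
    #------------------------------------------------------------------#
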
    def decode_box(self, predictions, anchors, image_shape, input_shape, letterbox_image,
                   variances=[0.1, 0.2], nms_iou=0.3, confidence=0.5):
        #---------------------------------------------------#
        #   predictions[0] contains the regression results
        #---------------------------------------------------#
        mbox_loc = torch.from_numpy(predictions[0])
        #---------------------------------------------------#
        #   Class confidences
        #---------------------------------------------------#
        mbox_conf = nn.Softmax(-1)(torch.from_numpy(predictions[1]))

        results = []
        #--------------------------------------------------------------------------------------------#
        #   Process each image in the batch. predict.py feeds in a single image,
        #   so for i in range(len(mbox_loc)) runs only once.
        #--------------------------------------------------------------------------------------------#
        for i in range(len(mbox_loc)):
            results.append([])
            #--------------------------------#
            #   Decode the anchors with the
            #   regression results
            #--------------------------------#
            decode_bbox = self.decode_boxes(mbox_loc[i], anchors, variances)

            for c in range(1, self.num_classes):
                #--------------------------------#
                #   Take the confidences of all
                #   boxes for this class and check
                #   them against the threshold
                #--------------------------------#
                c_confs   = mbox_conf[i, :, c]
                c_confs_m = c_confs > confidence
                if len(c_confs[c_confs_m]) > 0:
                    #-----------------------------------------#
                    #   Keep the boxes scoring above confidence
                    #-----------------------------------------#
                    boxes_to_process = decode_bbox[c_confs_m]
                    confs_to_process = c_confs[c_confs_m]

                    keep = nms(
                        boxes_to_process,
                        confs_to_process,
                        nms_iou
                    )
                    #-----------------------------------------#
                    #   Keep what survives non-maximum suppression
                    #-----------------------------------------#
                    good_boxes = boxes_to_process[keep]
                    confs      = confs_to_process[keep][:, None]
                    labels     = (c - 1) * torch.ones((len(keep), 1)).cuda() if confs.is_cuda \
                                 else (c - 1) * torch.ones((len(keep), 1))
                    #-----------------------------------------#
                    #   Stack the label, confidence and box coordinates.
                    #-----------------------------------------#
                    c_pred = torch.cat((good_boxes, labels, confs), dim=1).cpu().numpy()
                    # Append to the results for this image
                    results[-1].extend(c_pred)

            if len(results[-1]) > 0:
                results[-1] = np.array(results[-1])
                box_xy, box_wh = (results[-1][:, 0:2] + results[-1][:, 2:4]) / 2, results[-1][:, 2:4] - results[-1][:, 0:2]
                results[-1][:, :4] = self.ssd_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)

        return results
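
#---------------------------------------------------------------------------#
#   Minimal usage sketch for BBoxUtility (a sketch with assumed shapes and
#   values; any SSD head producing arrays of these shapes works the same):
#
#       util      = BBoxUtility(num_classes=21)
#       mbox_loc  = np.zeros((1, 8732, 4), dtype=np.float32)    # (bs, anchors, 4)
#       mbox_conf = np.zeros((1, 8732, 21), dtype=np.float32)   # (bs, anchors, classes)
#       anchors   = torch.rand(8732, 4)                         # normalized xyxy
#       results   = util.decode_box([mbox_loc, mbox_conf], anchors,
#                                   image_shape=(375, 500), input_shape=(300, 300),
#                                   letterbox_image=True)
#
#   When detections survive the threshold and NMS, each row of results[0]
#   is (ymin, xmin, ymax, xmax, label, confidence), with the box in pixel
#   coordinates of the original image.
#---------------------------------------------------------------------------#
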
def loc2bbox(src_bbox, loc):
    if src_bbox.size()[0] == 0:
        return torch.zeros((0, 4), dtype=loc.dtype)

    # Width, height and center of the source boxes
    src_width  = torch.unsqueeze(src_bbox[:, 2] - src_bbox[:, 0], -1)
    src_height = torch.unsqueeze(src_bbox[:, 3] - src_bbox[:, 1], -1)
    src_ctr_x  = torch.unsqueeze(src_bbox[:, 0], -1) + 0.5 * src_width
    src_ctr_y  = torch.unsqueeze(src_bbox[:, 1], -1) + 0.5 * src_height

    # Regression offsets: dx, dy shift the center; dw, dh scale the size
    dx = loc[:, 0::4]
    dy = loc[:, 1::4]
    dw = loc[:, 2::4]
    dh = loc[:, 3::4]

    ctr_x = dx * src_width + src_ctr_x
    ctr_y = dy * src_height + src_ctr_y
    w = torch.exp(dw) * src_width
    h = torch.exp(dh) * src_height

    # Convert back to corner form
    dst_bbox = torch.zeros_like(loc)
    dst_bbox[:, 0::4] = ctr_x - 0.5 * w
    dst_bbox[:, 1::4] = ctr_y - 0.5 * h
    dst_bbox[:, 2::4] = ctr_x + 0.5 * w
    dst_bbox[:, 3::4] = ctr_y + 0.5 * h

    return dst_bbox
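
#---------------------------------------------------------------------------#
#   Worked example for loc2bbox (illustrative numbers): a 10x10 box centered
#   at (10, 10) with offsets (0, 0, log 2, log 2) keeps its center and
#   doubles its width and height:
#
#       src = torch.tensor([[5., 5., 15., 15.]])
#       loc = torch.log(torch.tensor([[1., 1., 2., 2.]]))
#       loc2bbox(src, loc)    # -> tensor([[0., 0., 20., 20.]])
#---------------------------------------------------------------------------#
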

class DecodeBox():
    def __init__(self, num_classes):
        self.std         = torch.Tensor([0.1, 0.1, 0.2, 0.2]).repeat(num_classes + 1)[None]
        self.num_classes = num_classes + 1

    def frcnn_correct_boxes(self, box_xy, box_wh, input_shape, image_shape):
        #-----------------------------------------------------------------#
        #   Put the y axis first: it makes multiplying the boxes by the
        #   image height and width more convenient.
        #-----------------------------------------------------------------#
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = np.array(input_shape)
        image_shape = np.array(image_shape)

        box_mins  = box_yx - (box_hw / 2.)
        box_maxes = box_yx + (box_hw / 2.)
        boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes
    def forward(self, roi_cls_locs, roi_scores, rois, image_shape, input_shape, nms_iou=0.3, confidence=0.5):
        roi_cls_locs = torch.from_numpy(roi_cls_locs)
        roi_scores   = torch.from_numpy(roi_scores)
        rois         = torch.from_numpy(rois)
        results = []
        bs = len(roi_cls_locs)
        #--------------------------------#
        #   batch_size, num_rois, 4
        #--------------------------------#
        rois = rois.view((bs, -1, 4))
        #--------------------------------------------------------------------------------------------#
        #   Process each image in the batch. predict.py feeds in a single image,
        #   so for i in range(bs) runs only once.
        #--------------------------------------------------------------------------------------------#
        for i in range(bs):
            #----------------------------------------------------------#
            #   Rescale the regression parameters
            #----------------------------------------------------------#
            roi_cls_loc = roi_cls_locs[i] * self.std
            #----------------------------------------------------------#
            #   First dim: number of proposals; second dim: class;
            #   third dim: regression parameters for that class
            #----------------------------------------------------------#
            roi_cls_loc = roi_cls_loc.view([-1, self.num_classes, 4])

            #-------------------------------------------------------------#
            #   Adjust the proposals with the classifier outputs to
            #   obtain the final boxes.
            #   num_rois, 4 -> num_rois, 1, 4 -> num_rois, num_classes, 4
            #-------------------------------------------------------------#
            roi      = rois[i].view((-1, 1, 4)).expand_as(roi_cls_loc)
            cls_bbox = loc2bbox(roi.contiguous().view((-1, 4)), roi_cls_loc.contiguous().view((-1, 4)))
            cls_bbox = cls_bbox.view([-1, self.num_classes, 4])
            #-------------------------------------------------------------#
            #   Normalize the boxes into the [0, 1] range
            #-------------------------------------------------------------#
            cls_bbox[..., [0, 2]] = cls_bbox[..., [0, 2]] / input_shape[1]
            cls_bbox[..., [1, 3]] = cls_bbox[..., [1, 3]] / input_shape[0]

            roi_score = roi_scores[i]
            prob      = F.softmax(roi_score, dim=-1)

            results.append([])
            for c in range(1, self.num_classes):
                #--------------------------------#
                #   Take the confidences of all
                #   boxes for this class and check
                #   them against the threshold
                #--------------------------------#
                c_confs   = prob[:, c]
                c_confs_m = c_confs > confidence
                if len(c_confs[c_confs_m]) > 0:
                    #-----------------------------------------#
                    #   Keep the boxes scoring above confidence
                    #-----------------------------------------#
                    boxes_to_process = cls_bbox[c_confs_m, c]
                    confs_to_process = c_confs[c_confs_m]

                    keep = nms(
                        boxes_to_process,
                        confs_to_process,
                        nms_iou
                    )
                    #-----------------------------------------#
                    #   Keep what survives non-maximum suppression
                    #-----------------------------------------#
                    good_boxes = boxes_to_process[keep]
                    confs      = confs_to_process[keep][:, None]
                    labels     = (c - 1) * torch.ones((len(keep), 1)).cuda() if confs.is_cuda \
                                 else (c - 1) * torch.ones((len(keep), 1))
                    #-----------------------------------------#
                    #   Stack the label, confidence and box coordinates.
                    #-----------------------------------------#
                    c_pred = torch.cat((good_boxes, confs, labels), dim=1).cpu().numpy()
                    # Append to the results for this image
                    results[-1].extend(c_pred)

            if len(results[-1]) > 0:
                results[-1] = np.array(results[-1])
                box_xy, box_wh = (results[-1][:, 0:2] + results[-1][:, 2:4]) / 2, results[-1][:, 2:4] - results[-1][:, 0:2]
                results[-1][:, :4] = self.frcnn_correct_boxes(box_xy, box_wh, input_shape, image_shape)

        return results
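
#---------------------------------------------------------------------------#
#   Quick smoke test with random tensors (a minimal sketch; the shapes,
#   class counts and thresholds below are illustrative assumptions, not
#   values required by this file; real inputs come from the SSD and
#   Faster R-CNN heads).
#---------------------------------------------------------------------------#
if __name__ == "__main__":
    np.random.seed(0)
    torch.manual_seed(0)

    # SSD-style decoding: 100 anchors, 21 classes (background + 20 objects)
    util      = BBoxUtility(num_classes=21)
    mbox_loc  = np.random.randn(1, 100, 4).astype(np.float32)
    mbox_conf = np.random.randn(1, 100, 21).astype(np.float32)
    anchors   = torch.rand(100, 4)
    ssd_results = util.decode_box([mbox_loc, mbox_conf], anchors,
                                  image_shape=(375, 500), input_shape=(300, 300),
                                  letterbox_image=True, confidence=0.1)
    print("SSD detections:", len(ssd_results[0]))

    # Faster R-CNN-style decoding: 50 proposals, 20 classes plus background
    decoder      = DecodeBox(num_classes=20)
    roi_cls_locs = np.random.randn(1, 50, 21 * 4).astype(np.float32)
    roi_scores   = np.random.randn(1, 50, 21).astype(np.float32)
    rois         = (np.random.rand(1, 50, 4) * 600).astype(np.float32)
    frcnn_results = decoder.forward(roi_cls_locs, roi_scores, rois,
                                    image_shape=(375, 500), input_shape=(600, 600),
                                    confidence=0.1)
    print("Faster R-CNN detections:", len(frcnn_results[0]))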