import torch
import torchvision
from torch import nn
import cv2
import numpy as np
import time
from collections import OrderedDict
#---network definition
def autopad(k, p=None):
    """Return the padding to use for a convolution with kernel size ``k``.

    Args:
        k: kernel size, either an int or an iterable of ints (one per dimension).
        p: explicit padding; when not ``None`` it is returned unchanged.

    Returns:
        ``p`` if given, otherwise ``k // 2`` (an int for an int kernel, a list
        of per-dimension values for an iterable kernel).
    """
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [side // 2 for side in k]
class Conv(nn.Module):
    """Convolution block: Conv2d (no bias) -> BatchNorm2d -> activation.

    Args:
        c1: input channels.
        c2: output channels.
        k: kernel size.
        s: stride.
        p: padding; ``None`` lets ``autopad`` derive it from ``k``.
        g: convolution groups.
        act: ``True`` selects SiLU, an ``nn.Module`` instance is used as the
            activation as given, and any other value disables the activation.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2, eps=0.001)
        # Honour the ``act`` argument instead of always applying SiLU.
        # nn.Identity has no parameters, so state-dict keys are unaffected.
        if act is True:
            self.act = nn.SiLU(inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, batch normalisation and activation to ``x``."""
        return self.act(self.bn(self.conv(x)))
class Focus(nn.Module):
    """Fold 2x2 spatial neighbourhoods into channels, then apply a ``Conv``.

    The four interleaved sub-grids of dims 2 and 3 are concatenated on dim 1,
    so the inner ``Conv`` is built with ``c1 * 4`` input channels.

    Args:
        c1: input channels (before folding).
        c2: output channels.
        k, s, p, g, act: forwarded unchanged to ``Conv``.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super().__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):
        """Map a 4-D tensor (b, c, d2, d3) to the conv of (b, 4c, d2/2, d3/2)."""
        even_rows = x[:, :, ::2, :]
        odd_rows = x[:, :, 1::2, :]
        # Order matters: the conv weights expect exactly this channel layout.
        patches = [
            even_rows[..., ::2],
            odd_rows[..., ::2],
            even_rows[..., 1::2],
            odd_rows[..., 1::2],
        ]
        return self.conv(torch.cat(patches, 1))
class Bottleneck(nn.Module):
    """1x1 ``Conv`` followed by a 3x3 ``Conv``, with an optional residual add.

    Args:
        c1: input channels.
        c2: output channels.
        shortcut: request a residual connection; it is only applied when
            ``c1 == c2``.
        g: groups for the 3x3 convolution.
        e: expansion ratio; the hidden width is ``int(c2 * e)``.
    """

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Return ``cv2(cv1(x))``, plus ``x`` when the shortcut is active."""
        out = self.cv2(self.cv1(x))
        if self.add:
            return x + out
        return out
class C3(nn.Module):
    """CSP bottleneck built from three ``Conv`` layers and ``n`` ``Bottleneck`` blocks.

    The input goes through two parallel 1x1 convolutions; one branch is
    additionally passed through the bottleneck stack. Both branches are
    concatenated on dim 1 and fused by a final 1x1 convolution.

    Args:
        c1: input channels.
        c2: output channels.
        n: number of stacked ``Bottleneck`` blocks.
        shortcut: forwarded to each ``Bottleneck``.
        g: groups, forwarded to each ``Bottleneck``.
        e: expansion ratio; the hidden width is ``int(c2 * e)``.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__()
        hidden = int(c2 * e)
        # Registration order (cv1, cv2, cv3, m) defines the state-dict key order.
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)
        stack = (Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n))
        self.m = nn.Sequential(*stack)

    def forward(self, x):
        """Fuse the bottleneck branch with the plain 1x1 branch."""
        deep_branch = self.m(self.cv1(x))
        skip_branch = self.cv2(x)
        return self.cv3(torch.cat((deep_branch, skip_branch), dim=1))
class SPP(nn.Module):
    """Spatial pyramid pooling: stride-1 max-pools of several sizes, concatenated.

    Args:
        c1: input channels.
        c2: output channels.
        k: max-pool kernel sizes; each pool uses stride 1 and padding ``size // 2``.
    """

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        # One un-pooled copy plus one pooled copy per kernel size.
        self.cv2 = Conv(hidden * (len(k) + 1), c2, 1, 1)
        pools = [nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2) for size in k]
        self.m = nn.ModuleList(pools)

    def forward(self, x):
        """Reduce channels, pool at every scale, concatenate on dim 1 and fuse."""
        reduced = self.cv1(x)
        pyramid = [reduced]
        for pool in self.m:
            pyramid.append(pool(reduced))
        return self.cv2(torch.cat(pyramid, 1))
class Concat(nn.Module):
    """Module wrapper around ``torch.cat`` with a fixed dimension.

    Args:
        dimension: dimension along which the tensors are joined.
    """

    def __init__(self, dimension=1):
        super().__init__()
        self.d = dimension

    def forward(self, x):
        """Concatenate the sequence of tensors ``x`` along ``self.d``."""
        return torch.cat(x, dim=self.d)
class Detect(nn.Module):
    """Detection head: one 1x1 convolution per feature level plus box decoding.

    ``forward`` always runs the inference path and returns decoded predictions.

    Args:
        nc: number of classes.
        anchors: one row per detection level, each a flat sequence of
            (width, height) pairs; must contain at least one row.
        ch: input channel count of each level's feature map.
    """

    # Per-level scale applied to the decoded xy offsets. Only three entries
    # are defined, so at most three detection levels are supported.
    stride = torch.tensor(data=[8., 16., 32.])

    def __init__(self, nc=80, anchors=(), ch=()):
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # outputs per anchor: x, y, w, h, confidence + classes
        self.nl = len(anchors)  # number of detection levels (feature maps)
        self.na = len(anchors[0]) // 2  # number of anchors (prior boxes) per level
        # Independent placeholders so levels never alias one cached grid.
        self.grid = [torch.zeros(1) for _ in range(self.nl)]
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape (nl, na, 2)
        # shape (nl, 1, na, 1, 1, 2); the trailing 2 is (width, height)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)

    def forward(self, x):
        """Decode predictions for every level and concatenate them.

        Args:
            x: sequence of ``nl`` feature maps, each (bs, ch[i], ny, nx).
                The sequence is left unmodified.

        Returns:
            Tensor of shape (bs, sum_i(na * ny_i * nx_i), no).
        """
        z = []
        for i in range(self.nl):
            feat = self.m[i](x[i])
            bs, _, ny, nx = feat.shape
            # (bs, na * no, ny, nx) -> (bs, na, ny, nx, no)
            feat = feat.view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            grid = self.grid[i]
            # Rebuild the cached grid when the map size or the device changes.
            if grid.shape[2:4] != feat.shape[2:4] or grid.device != feat.device:
                grid = self._make_grid(nx, ny).to(feat.device)
                self.grid[i] = grid

            y = feat.sigmoid()
            # Built out-of-place: writing into the sigmoid output would
            # invalidate the values autograd keeps for the backward pass.
            xy = (y[..., 0:2] * 2. - 0.5 + grid) * self.stride[i]  # xy offset relative to grid
            wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            y = torch.cat((xy, wh, y[..., 4:]), -1)
            z.append(y.view(bs, -1, self.no))
        return torch.cat(z, 1)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """Return a float (1, 1, ny, nx, 2) tensor whose last dim is (x index, y index)."""
        # Broadcasting replaces torch.meshgrid, whose default indexing is
        # deprecated in recent PyTorch releases; the result is identical.
        ys = torch.arange(ny).view(ny, 1).expand(ny, nx)
        xs = torch.arange(nx).view(1, nx).expand(ny, nx)
        return torch.stack((xs, ys), 2).view(1, 1, ny, nx, 2).float()
#---utility function
def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    Args:
        x: ``torch.Tensor`` or numpy array whose last dimension holds
            [centre x, centre y, width, height] in its first four entries.
            Any number of leading dimensions is accepted (a single box of
            shape (4,), the usual (n, 4), or batched (b, n, 4)).

    Returns:
        A copy of ``x`` whose first four last-dimension entries are
        [x1, y1, x2, y2] (top-left and bottom-right corners). ``x`` itself
        is not modified.
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    y[..., 0] = x[..., 0] - half_w  # top left x
    y[..., 1] = x[..., 1] - half_h  # top left y
    y[..., 2] = x[..., 0] + half_w  # bottom right x
    y[..., 3] = x[..., 1] + half_h  # bottom right y
    return y
def box_iou(box1, box2):
    """Pairwise intersection-over-union of two sets of corner-format boxes.

    Args:
        box1: tensor (N, 4) of [x1, y1, x2, y2] boxes.
        box2: tensor (M, 4) of [x1, y1, x2, y2] boxes.

    Returns:
        Tensor (N, M) where entry (i, j) is the IoU of ``box1[i]`` and ``box2[j]``.
    """
    def _area(corners):
        # ``corners`` is the transposed box tensor: 4 x n
        width = corners[2] - corners[0]
        height = corners[3] - corners[1]
        return width * height

    area_a = _area(box1.T)
    area_b = _area(box2.T)

    # Broadcast (N, 1, 2) against (M, 2) to get every pair's overlap rectangle.
    top_left = torch.max(box1[:, None, :2], box2[:, :2])
    bottom_right = torch.min(box1[:, None, 2:], box2[:, 2:])
    overlap = (bottom_right - top_left).clamp(0).prod(2)

    union = area_a[:, None] + area_b - overlap
    return overlap / union
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False,
                        multi_label=False, labels=(), max_det=300):
    """Filter raw detections by confidence and apply NMS image by image.

    Args:
        prediction: tensor indexed (image, candidate, 5 + nc). As used below,
            columns 0-3 are a centre-format box (x, y, w, h), column 4 is the
            objectness score and columns 5+ are per-class scores.
        conf_thres: minimum objectness, and minimum objectness * class score.
        iou_thres: IoU threshold passed to ``torchvision.ops.nms``.
        classes: optional collection of class indices to keep.
        agnostic: when true, boxes of different classes suppress each other.
        multi_label: allow several classes per box (only effective if nc > 1).
        labels: optional per-image label tensors appended as extra candidates;
            column 0 is read as the class index and columns 1-4 as the box.
        max_det: maximum number of detections kept per image.

    Returns:
        A list with one tensor per image, each of shape (n, 6) with columns
        (x1, y1, x2, y2, conf, cls). Images without detections, and images not
        reached before the time limit, get an empty (0, 6) tensor.
    """
    nc = prediction.shape[2] - 5  # number of classes
    candidates = prediction[..., 4] > conf_thres

    # Settings
    max_wh = 4096  # per-class box offset used to keep classes apart in NMS
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections (merge-NMS only)
    merge = False  # use merge-NMS
    multi_label &= nc > 1  # multiple labels per box need more than one class

    start = time.time()
    # One independent tensor per image so in-place edits by the caller on one
    # image's result can never alias another image's result.
    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(prediction.shape[0])]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[candidates[xi]]  # boolean indexing copies, so prediction is untouched

        # Append a-priori labels as full-confidence candidates (autolabelling)
        if labels and len(labels[xi]):
            lb = labels[xi]
            extra = torch.zeros((len(lb), nc + 5), device=x.device)
            extra[:, :4] = lb[:, 1:5]  # box
            extra[:, 4] = 1.0  # conf
            extra[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, extra), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes: keep the most confident ones
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]

        # Batched NMS: shifting boxes by class index * max_wh keeps boxes of
        # different classes from overlapping unless ``agnostic`` is set.
        c = x[:, 5:6] * (0 if agnostic else max_wh)
        boxes, scores = x[:, :4] + c, x[:, 4]
        i = torchvision.ops.nms(boxes, scores, iou_thres)
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - start) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
def clip_coords(boxes, img_shape):
    """Clip corner-format boxes in place to the bounds of an image.

    Args:
        boxes: ``torch.Tensor`` or numpy array of shape (n, 4+) whose first
            four columns are [x1, y1, x2, y2]. Modified in place.
        img_shape: image shape as (height, width, ...); only the first two
            entries are read.

    Returns:
        None.
    """
    height, width = img_shape[0], img_shape[1]
    if isinstance(boxes, torch.Tensor):
        boxes[:, 0].clamp_(0, width)  # x1
        boxes[:, 1].clamp_(0, height)  # y1
        boxes[:, 2].clamp_(0, width)  # x2
        boxes[:, 3].clamp_(0, height)  # y2
    else:
        # numpy arrays have no clamp_; clip and write back to stay in place
        boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, width)  # x1, x2
        boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, height)  # y1, y2
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """Map corner-format boxes from one image shape back to another, in place.

    Args:
        img1_shape: (height, width) of the image the boxes currently refer to.
        coords: boxes with [x1, y1, x2, y2] in the first four columns;
            modified in place.
        img0_shape: (height, width, ...) of the target image.
        ratio_pad: optional ``((gain, ...), (pad_x, pad_y))``; when omitted,
            gain and padding are derived from the two shapes.

    Returns:
        ``coords`` after un-padding, rescaling and clipping to ``img0_shape``.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = old / new, taking the tighter of the two axes
        height_gain = img1_shape[0] / img0_shape[0]
        width_gain = img1_shape[1] / img0_shape[1]
        gain = min(height_gain, width_gain)
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords
def plot_one_box(x, im, color=None, label=None, line_thickness=3):
    """Draw one bounding box, and optionally a label banner, on ``im`` with OpenCV.

    Args:
        x: box as an indexable of at least four numbers (x1, y1, x2, y2).
        im: image array to draw on; modified in place and must be contiguous.
        color: colour of the box and banner; a random one is drawn when falsy.
        label: optional text placed on a filled banner above the top-left corner.
        line_thickness: line thickness; when falsy it is derived from the image size.
    """
    assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to plot_on_box() input image.'
    thickness = line_thickness or round(0.002 * (im.shape[0] + im.shape[1]) / 2) + 1
    if not color:
        color = [np.random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(im, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # The banner extends upwards from the box's top-left corner.
    banner_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
    cv2.rectangle(im, top_left, banner_corner, color, -1, cv2.LINE_AA)  # filled
    text_origin = (top_left[0], top_left[1] - 2)
    cv2.putText(im, label, text_origin, 0, font_scale, [225, 255, 255],
                thickness=font_thickness, lineType=cv2.LINE_AA)
class YoloV5(nn.Module):
    """YOLOv5s-style detector: backbone, feature-fusion head and ``Detect``.

    Submodules are registered in a fixed order because the loading code in
    this file maps pretrained weights onto ``state_dict()`` keys by position;
    do not reorder the attribute assignments in ``__init__``.

    Args:
        nc: number of classes.
        anchors: anchor rows, one per detection level (required).

    Raises:
        ValueError: if ``anchors`` is ``None``.
    """

    def __init__(self, nc=80, anchors=None):
        super().__init__()
        # ``is None`` avoids an element-wise comparison for array-like anchors,
        # and an explicit raise survives ``python -O`` (asserts are stripped).
        if anchors is None:
            raise ValueError('anchor must be provided')
        # Backbone
        self.Focus_0 = Focus(c1=3, c2=32, k=3, p=1)
        self.Conv_1 = Conv(c1=32, c2=64, k=3, s=2, p=1)
        self.C3_2 = C3(c1=64, c2=64)
        self.Conv_3 = Conv(c1=64, c2=128, k=3, s=2, p=1)
        self.C3_4 = C3(c1=128, c2=128, n=3)
        self.Conv_5 = Conv(c1=128, c2=256, k=3, s=2, p=1)
        self.C3_6 = C3(c1=256, c2=256, n=3)
        self.Conv_7 = Conv(c1=256, c2=512, k=3, s=2, p=1)
        self.SPP_8 = SPP(c1=512, c2=512, k=(5, 9, 13))
        self.C3_9 = C3(c1=512, c2=512)
        # Head: top-down path
        self.Conv_10 = Conv(c1=512, c2=256, k=1, s=1)
        self.Upsample_11 = nn.Upsample(scale_factor=2.0, mode='nearest')
        self.Concat_12 = Concat(dimension=1)
        self.C3_13 = C3(c1=512, c2=256, shortcut=False)
        self.Conv_14 = Conv(c1=256, c2=128, k=1, s=1)
        self.Upsample_15 = nn.Upsample(scale_factor=2.0, mode='nearest')
        self.Concat_16 = Concat(dimension=1)
        self.C3_17 = C3(c1=256, c2=128, shortcut=False)
        # Head: bottom-up path
        self.Conv_18 = Conv(c1=128, c2=128, k=3, s=2, p=1)
        self.Concat_19 = Concat(dimension=1)
        self.C3_20 = C3(c1=256, c2=256, shortcut=False)
        self.Conv_21 = Conv(c1=256, c2=256, k=3, s=2, p=1)
        self.Concat_22 = Concat(dimension=1)
        self.C3_23 = C3(c1=512, c2=512, shortcut=False)
        self.Detect_24 = Detect(nc=nc, anchors=anchors, ch=(128, 256, 512))

    def forward(self, x):
        """Run the network on an image batch and return decoded detections.

        Args:
            x: image tensor (batch, 3, height, width).

        Returns:
            The output of ``Detect`` for the three fused feature levels.
        """
        # Backbone; the outputs of C3_4 and C3_6 are kept for the skip connections.
        x = self.Focus_0(x)
        x = self.Conv_1(x)
        x = self.C3_2(x)
        x = self.Conv_3(x)
        c3_4 = self.C3_4(x)
        x = self.Conv_5(c3_4)
        c3_6 = self.C3_6(x)
        x = self.Conv_7(c3_6)
        x = self.SPP_8(x)
        x = self.C3_9(x)

        # Top-down path: upsample and fuse with the backbone skips.
        conv_10 = self.Conv_10(x)
        x = self.Upsample_11(conv_10)
        x = self.Concat_12([x, c3_6])
        x = self.C3_13(x)
        conv_14 = self.Conv_14(x)
        x = self.Upsample_15(conv_14)
        x = self.Concat_16([x, c3_4])
        feat_s8 = self.C3_17(x)

        # Bottom-up path: downsample and fuse with the top-down laterals.
        x = self.Conv_18(feat_s8)
        x = self.Concat_19([x, conv_14])
        feat_s16 = self.C3_20(x)
        x = self.Conv_21(feat_s16)
        x = self.Concat_22([x, conv_10])
        feat_s32 = self.C3_23(x)

        return self.Detect_24([feat_s8, feat_s16, feat_s32])
#------------------------main
# Class names, indexed by the class id that appears in the detections.
names = [
    'person',
    'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog',
    'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]
# One random 3-channel colour per class, used when drawing boxes.
colors = []
for _ in names:
    colors.append([np.random.randint(0, 255) for _ in range(3)])
#
# Anchor boxes: three rows, one per detection feature map. Each row holds six
# values, i.e. three (width, height) pairs.
#   row 1: anchors for the largest feature map
#   row 2: anchors for the middle feature map
#   row 3: anchors for the smallest feature map
anchors = [
    [10, 13, 16, 30, 33, 23],
    [30, 61, 62, 45, 59, 119],
    [116, 90, 156, 198, 373, 326],
]

# Post-processing settings
conf_thres = 0.25  # confidence threshold passed to non_max_suppression
iou_thres = 0.45  # IoU threshold passed to non_max_suppression
classes = None  # optional class filter; None keeps every class
agnostic_nms = False  # class-agnostic NMS switch

# Drawing settings
hide_labels = False  # draw boxes without any text
hide_conf = False  # draw the class name without the confidence value
line_thickness = 2  # box line thickness passed to plot_one_box
#
model = YoloV5(anchors=anchors)
# Load the pretrained parameters. map_location keeps the load working on
# machines without the device the checkpoint was saved from.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model_pre_dict = torch.load('yolov5s.dict', map_location='cpu')
model_pre_key_list = list(model_pre_dict.keys())
model_keys = list(model.state_dict().keys())
# The checkpoint's key names differ from this model's, so entries are matched
# purely by position; a short checkpoint cannot be mapped.
if len(model_pre_key_list) < len(model_keys):
    raise RuntimeError(
        f'checkpoint has {len(model_pre_key_list)} entries but the model expects {len(model_keys)}'
    )
# Re-key the pretrained tensors with this model's parameter names.
model_parameter_dict = OrderedDict(
    (key, model_pre_dict[pre_key]) for key, pre_key in zip(model_keys, model_pre_key_list)
)
model.load_state_dict(model_parameter_dict)
model.eval()
print('model parameter loaded')
#
image_path = 'data/images/bus.jpg'
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError(f'cannot read image: {image_path}')
image = cv2.resize(image, (640, 640))
image_init = image.copy()  # resized BGR copy kept for drawing
image = image[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to 3x640x640
image = np.ascontiguousarray(image)  # make the memory layout contiguous
image = torch.from_numpy(image)  # convert to a tensor
image = image.float()  # uint8 to fp32
image /= 255.0  # 0 - 255 to 0.0 - 1.0
if image.ndimension() == 3:
    image = image.unsqueeze(0)  # add the batch dimension
print('--------------------********************--------------------')
print('image.shape', image.shape)
with torch.no_grad():
    pred = model(image)
print('pred.shape', pred.shape)
print('--------------------********************--------------------')
# Apply NMS
pred = non_max_suppression(prediction=pred, conf_thres=conf_thres, iou_thres=iou_thres,
                           classes=classes, agnostic=agnostic_nms)
print('pred[0].shape', pred[0].shape)
det = pred[0]
print('det.shape', det.shape)
print('det = ', det)
if len(det):
    # Rescale boxes from the network input size to the displayed image size
    det[:, :4] = scale_coords(image.shape[2:], det[:, :4], image_init.shape).round()
    # Write results
    for *xyxy, conf, cls in reversed(det):
        c = int(cls)  # integer class
        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
        plot_one_box(xyxy, image_init, label=label, color=colors[c], line_thickness=line_thickness)
cv2.imshow('image_init', image_init)
cv2.waitKey(0)
cv2.destroyAllWindows()  # release the window once a key has been pressed
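# Source: "[Object Detection] (4) YOLOv5s inference code",
# reposted from blog.csdn.net/qq_46644680/article/details/132920936
# (The blog page's navigation captions followed here: "You may also like",
# "Today's recommendations", "Weekly ranking".)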