SPPNet最大的优势是可以接收任意尺寸的输入。
普通CNN之所以要求固定尺寸的输入,根本原因在于:卷积层可以接收任意尺寸的输入,并生成对应尺寸的输出,但全连接层必须接收固定长度的输入,例如VGG-16中的fc(4096, 4096)要求输入向量的长度必须是4096。在卷积层之后添加Spatial Pyramid Pooling层,可以为全连接层提供固定长度的输入,从而解决了CNN要求固定尺寸输入这一问题。
Spatial Pyramid Pool layer的作用是产生统一尺寸的输出,并提供给后续的全连接层,具体实现如下:
import torch.nn as nn
import torch
from torch.nn import Conv2d, MaxPool2d, Linear
import torch.nn.functional as F
import math
def spp_net(conv_kernel, sample_num, conv_kernel_size, output_size):
    """Build a fixed-length spatial-pyramid feature vector from a feature map.

    For every pyramid level ``n`` in ``output_size`` the feature map is
    max-pooled down to an ``n x n`` grid, flattened per sample, and the
    results of all levels are concatenated along dimension 1.

    Args:
        conv_kernel: Feature map tensor to pool. The caller in this file
            passes the conv output laid out as (batch, channels, height, width).
        sample_num: Number of rows of the result (the batch size); each
            level is flattened with ``reshape(sample_num, -1)``.
        conv_kernel_size: ``[height, width]`` of the feature map. Kept only
            for backward compatibility: adaptive pooling reads the spatial
            size from the tensor itself, so this value is not used.
        output_size: Sequence of pyramid levels, e.g. ``[4, 2, 1]``.

    Returns:
        Tensor with ``sample_num`` rows; for a 4-D input its width is
        ``channels * sum(n * n for n in output_size)`` regardless of the
        spatial size of ``conv_kernel``.

    Raises:
        ValueError: If ``output_size`` is empty.
    """
    if len(output_size) == 0:
        raise ValueError("output_size must contain at least one pyramid level")

    # A fixed window/stride of ceil(h/n)/floor(h/n) does not always yield an
    # n x n grid (e.g. h=7, n=4 gives 6) and has stride 0 when h < n.
    # Adaptive pooling always produces exactly n x n bins covering the whole
    # map, which is what the following fully connected layer relies on.
    levels = [
        F.adaptive_max_pool2d(conv_kernel, bins).reshape(sample_num, -1)
        for bins in output_size
    ]
    return torch.cat(levels, dim=1)
class SPPCLASS(nn.Module):
    """Convolutional classifier with a spatial pyramid pooling (SPP) head.

    Five convolution layers are followed by spatial pyramid pooling, which
    turns the final feature map into a vector whose length is fixed by
    ``output_num``, and then by three fully connected layers.

    Args:
        init_weights: When true, re-initialise all parameters with
            ``_initialize_weights`` after the layers are built.
        num_classes: Size of the final output layer (number of classes).
    """

    def __init__(self, init_weights=True, num_classes=37):
        super().__init__()
        # Pyramid levels; they determine the input width of the first fc layer.
        self.output_num = [4, 2, 1]

        self.conv1 = nn.Conv2d(3, 96, kernel_size=7, stride=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, stride=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(256, 384, kernel_size=3)
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3)

        # Each level n contributes n * n bins for each of conv5's 256 channels.
        spp_features = sum(n * n for n in self.output_num) * 256
        self.fc1 = nn.Linear(spp_features, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.out = nn.Linear(4096, num_classes)

        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Return the output-layer scores for a batch of images ``x``."""
        x = F.relu(self.conv1(x))
        x = F.local_response_norm(x, size=4)
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = F.local_response_norm(x, size=4)
        x = self.pool2(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))

        # x has 256 channels here (conv5's out_channels); its height and
        # width depend on the size of the input image.
        spp = spp_net(
            x, x.size(0), [int(x.size(2)), int(x.size(3))], self.output_num
        )

        x = self.fc1(spp)
        # Functional dropout tied to self.training, so it is switched off by
        # model.eval(); a Dropout module created inside forward() would always
        # be in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(x)
        x = F.relu(self.fc2(x))
        return self.out(x)

    def _initialize_weights(self):
        """Re-initialise the parameters of every conv, batch-norm and linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)