SPPnet中的Spatial Pyramid Pooling Layer

SPPNet最大的优势是可以接收任意尺寸的输入。
根本原因是：CNN中的卷积层可以接收任意尺寸的输入，并生成对应尺寸的输出；但全连接层必须接收固定尺寸的输入，例如VGG-16中的fc（4096，4096）要求输入向量的长度必须为4096。在添加Spatial Pyramid Pooling layer之后，就可以为全连接层提供固定长度的输入，从而解决了CNN要求固定尺寸输入的问题。
Spatial Pyramid Pooling layer的作用是产生统一尺寸的输出，并提供给后续的全连接层，具体实现如下：


import torch.nn as nn
import torch
from torch.nn import Conv2d, MaxPool2d, Linear
import torch.nn.functional as F
import math


def spp_net(conv_kernel, sample_num, conv_kernel_size, output_size):
    """Apply spatial pyramid pooling and return one flat vector per sample.

    For every pyramid level ``n`` in ``output_size`` the feature map is
    max-pooled down to an ``n x n`` grid over its last two dimensions. Each
    grid is flattened to ``sample_num`` rows and the results are concatenated
    along dim 1, in the order the levels are given.

    Adaptive pooling is used instead of a hand-computed window and stride.
    The previous ``ceil``/``floor`` formula did not guarantee an ``n x n``
    grid (a 6x6 map at level 4 produced 5x5) and computed a stride of 0
    whenever the map was smaller than the level. When the map size is
    divisible by the level, both approaches pool over the same windows.

    Args:
        conv_kernel: Feature-map tensor to pool (the output of the last
            convolution in the caller).
        sample_num: Number of rows in the result, i.e. the batch size.
        conv_kernel_size: ``(h, w)`` of the feature map. Adaptive pooling
            reads the size from the tensor itself, so this value is no
            longer used; the parameter is kept so existing callers work.
        output_size: Sequence of pyramid levels, e.g. ``[4, 2, 1]``.

    Returns:
        A 2-D tensor with ``sample_num`` rows. For a ``(sample_num, C, H, W)``
        input the row length is ``C * sum(n * n for n in output_size)``.

    Raises:
        ValueError: If ``output_size`` is empty.
    """
    if len(output_size) == 0:
        raise ValueError("output_size must contain at least one pyramid level")

    # Pool once per level, then concatenate in a single call rather than
    # growing the result tensor inside the loop.
    pooled_levels = [
        F.adaptive_max_pool2d(conv_kernel, level).reshape(sample_num, -1)
        for level in output_size
    ]
    return torch.cat(pooled_levels, 1)  # dim=1: place the levels side by side


class SPPCLASS(nn.Module):
    """Five-convolution CNN with a spatial pyramid pooling layer before the FC head.

    The pyramid pooling step turns the final feature map into a vector whose
    length depends only on ``output_num`` and the channel count of ``conv5``.
    That is what lets ``fc1`` be built with a fixed input size.
    """

    def __init__(self, init_weights=True):
        """Build the layers.

        Args:
            init_weights: When true, re-initialise all parameters with
                ``_initialize_weights`` after the layers are created.
        """
        super().__init__()

        # Pyramid levels; this directly determines the input size of fc1.
        self.output_num = [4, 2, 1]

        self.conv1 = nn.Conv2d(3, 96, kernel_size=7, stride=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)

        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, stride=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)

        self.conv3 = nn.Conv2d(256, 384, kernel_size=3)
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3)

        # One n*n grid per pyramid level, each with conv5's 256 channels.
        self.fc1 = nn.Linear(sum([i * i for i in self.output_num]) * 256, 4096)
        self.fc2 = nn.Linear(4096, 4096)

        # Replace 37 with the number of classes in your own dataset.
        self.out = nn.Linear(4096, 37)

        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: Batched image tensor with 3 channels (conv1 expects 3 input
                channels; the size(2)/size(3) reads below require 4 dims).

        Returns:
            Tensor with one row per sample and 37 scores per row.
        """
        x = F.relu(self.conv1(x))
        x = F.local_response_norm(x, size=4)
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = F.local_response_norm(x, size=4)
        x = self.pool2(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))

        # x is now (N, 256, H, W); H and W depend on the input image size.
        spp = spp_net(x, x.size(0), [int(x.size(2)), int(x.size(3))], self.output_num)

        f1 = self.fc1(spp)
        # Tie dropout to the module's mode so it is disabled under eval().
        # Building nn.Dropout inline left it permanently in training mode.
        s = F.dropout(f1, p=0.5, training=self.training)
        fc1 = F.relu(s)
        fc2 = F.relu(self.fc2(fc1))

        output = self.out(fc2)
        return output

    def _initialize_weights(self):
        """Initialise the parameters of every submodule in place.

        Conv layers get Kaiming-normal weights and zero bias, BatchNorm
        layers weight 1 and bias 0, and Linear layers N(0, 0.01) weights
        and zero bias.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

SPPnet中的Spatial Pyramid Pooling Layer

猜你喜欢