摘要
在计算机视觉任务中,语义分割和关键点检测是具有挑战的,因为既要尽量保持高分辨率、又要考虑计算量,以及尽量连接全局信息得到结果。由于提取全局信息的有效性以及对全局像素空间(spatial)位置的注意力,attention机制在计算机视觉领域变得非常流行。CNN本身使用共享的卷积核,且具有平移不变性,也就代表其是局部(local)的且缺乏空间(spatial)注意力。但是long-range的attention参数量大,训练耗时长且难以收敛;很多学者都在研究轻便易于训练的attention机制。本文提出了一种极化注意力机制,其优势在于:1.使用正交的方式,保证了低参数量的同时,保证了高通道分辨率和高空间分辨率;2.在注意力机制中加入非线性,使得拟合的输出更具有细腻度(更加贴近真实输出);
简介
在语义分割和关键点检测中,大多数采用了上采样-下采样的骨干网络,下采样的时候减小特征图大小增大通道数,上采样的时候增大特征图减小通道数,之所以需要上采样-下采样的结构,是为了节省计算和激活值开销,同时加深网络深度;较高的分辨率的输入势必能提高准确率,对于分割这类细粒度的任务而言,下采样势必会降低精度;所以对于分割这类细粒度的任务,其挑战主要在于:1.在合理的开销的同时尽量保证高分辨率;2.尽量让输出去拟合到真实的分布;
有一些仅基于通道注意力的机制(SE , GE and GCNet ),其用在分类任务上是能够提升准确率的,因为对于分类而言,网络本身就要基于全局信息下采样,就已经在空间全局信息上做了融合,所以仅通道注意力是有效的,然后对于目标检测而言,基于通道注意力突出了全部前景的像素,所以仅通道注意力可以提升其准确率;
本文提出方案的重点在于:1.PSA模块内部保持高分辨率;2.在通道分支和空间分支加入softmax和sigmoid的混合增加非线性,去拟合更真实更有细腻度的输出分布;
对于像素回归的任务:我们先来说说高分辨率对于像素级的任务来说有多重要,HRNet以高分辨率的传输在像素级任务上取得了比Resnet更好的效果,以及Deeplab中的空洞卷积也是为了提高分辨率,抛开网络结构,输入600×600的分辨率一般会比500×500准确率更高;因为意识到高分辨率对于像素级任务的影响,所以很多学者都在研究如何在合理开销下提升分辨率;本文则是从attention模块的角度提升分辨率;
attention机制及其变种:
attention机制的产生是为了解决传统标准卷积存在的问题;attention机制由输入Tensor生成attention Tensor,以相乘方式叠加在输入Tensor上,它能有效利用全局的信息并给予不同的注意力;它一般都是放在卷积后面,模块具有很好的移植性,可以方便地添加在不同的模型当中;
全矩阵及简化的attention模块:
非局部模块(NL)提出了全矩阵attention机制并取得了成功;在q,v之间求取一个相似性矩阵会占用大量内存和计算开销,所以有很多学者在这方面进行了改进,有的仅基于通道对空间像素进行了re-weighted;
Our method
回归像素级任务的深度卷积网络从以下两个维度学习weighted的特征图:1.从通道角度要尽可能突出像素所属分类;2.从空间角度尽可能检测出属于同一语义的像素位置;以上两点就是attention机制的目的;
我们假设输入特征图是X,A是attention block,Z是经过attention变换后的输出。
为了方便解释,我们假设Cin=Cout;
以上,
Z=A(X)*X
在传统的非局部自注意力机制中:
我们方法是如下图所示:
PSA的具体实现代码如下所示:
import torch
import torch.nn as nn
import torch._utils
import torch.nn.functional as F
def constant_init(module, val, bias=0):
    """Fill a module's weight and bias with constant values, in place.

    Args:
        module: Object whose ``weight`` and ``bias`` attributes are filled.
            An attribute that is missing or ``None`` is left untouched.
        val: Constant written into ``module.weight``.
        bias: Constant written into ``module.bias``. Defaults to 0.
    """
    for attr_name, fill_value in (('weight', val), ('bias', bias)):
        tensor = getattr(module, attr_name, None)
        if tensor is not None:
            nn.init.constant_(tensor, fill_value)
def kaiming_init(module,
                 a=0,
                 mode='fan_out',
                 nonlinearity='relu',
                 bias=0,
                 distribution='normal'):
    """Apply Kaiming (He) initialization to a module's weight, in place.

    The weight is drawn with ``nn.init.kaiming_uniform_`` or
    ``nn.init.kaiming_normal_`` and the bias, when present, is filled with a
    constant.

    Args:
        module: Object whose ``weight`` / ``bias`` attributes are initialized.
            An attribute that is missing or ``None`` is skipped.
        a: Forwarded as ``a`` to the ``nn.init.kaiming_*_`` call.
        mode: Forwarded as ``mode`` (``'fan_in'`` or ``'fan_out'``).
        nonlinearity: Forwarded as ``nonlinearity``.
        bias: Constant written into ``module.bias``. Defaults to 0.
        distribution: ``'uniform'`` or ``'normal'``.

    Raises:
        AssertionError: If ``distribution`` is not one of the two accepted
            values.
    """
    assert distribution in ['uniform', 'normal']
    # Guard the weight the same way the bias is guarded, so weight-less
    # modules (activations, norm layers built with affine=False, ...) are
    # skipped instead of raising.
    weight = getattr(module, 'weight', None)
    if weight is not None:
        if distribution == 'uniform':
            init_fn = nn.init.kaiming_uniform_
        else:
            init_fn = nn.init.kaiming_normal_
        init_fn(weight, a=a, mode=mode, nonlinearity=nonlinearity)
    if hasattr(module, 'bias') and module.bias is not None:
        nn.init.constant_(module.bias, bias)
class PSA_p(nn.Module):
    """Polarized self-attention block with both branches run in parallel.

    ``forward`` feeds the same input to ``spatial_pool`` (a per-channel gate)
    and ``channel_pool`` (a per-pixel gate) and returns the sum of the two
    gated tensors.

    Args:
        inplanes: Number of channels of the input tensor.
        planes: Number of channels of the per-channel gate; the intermediate
            width is ``planes // 2``. The gate is multiplied with the input,
            so ``planes`` has to broadcast against ``inplanes``.
        kernel_size: Stored and used only to derive ``self.padding``; every
            convolution in the block is 1x1.
        stride: Stride of the four query/value 1x1 convolutions.
            NOTE(review): with ``stride > 1`` the per-pixel gate would be
            spatially smaller than the input; the block looks written for
            ``stride=1`` -- confirm.
    """

    def __init__(self, inplanes, planes, kernel_size=1, stride=1):
        super().__init__()
        self.inplanes = inplanes
        self.inter_planes = planes // 2
        self.planes = planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2

        def pointwise(out_channels):
            # Bias-free 1x1 projection of the block input.
            return nn.Conv2d(self.inplanes, out_channels, kernel_size=1,
                             stride=stride, padding=0, bias=False)

        # Sub-modules are created in a fixed order so that parameter
        # registration (and random initialization) order stays stable.
        # "right" branch: pools over space, gates channels.
        self.conv_q_right = pointwise(1)
        self.conv_v_right = pointwise(self.inter_planes)
        self.conv_up = nn.Conv2d(self.inter_planes, self.planes, kernel_size=1,
                                 stride=1, padding=0, bias=False)
        self.softmax_right = nn.Softmax(dim=2)
        self.sigmoid = nn.Sigmoid()

        # "left" branch: pools over channels, gates pixels.
        self.conv_q_left = pointwise(self.inter_planes)  # g
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv_v_left = pointwise(self.inter_planes)  # theta
        self.softmax_left = nn.Softmax(dim=2)

        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-initialize (fan_in) the four query/value convolutions.

        ``conv_up`` is not touched here and keeps the initialization it
        received at construction.
        """
        qv_convs = (self.conv_q_right, self.conv_v_right,
                    self.conv_q_left, self.conv_v_left)
        for conv in qv_convs:
            kaiming_init(conv, mode='fan_in')
        for conv in qv_convs:
            conv.inited = True

    def spatial_pool(self, x):
        """Gate the channels of ``x`` using a spatially pooled descriptor.

        Returns ``x`` multiplied by a sigmoid gate of shape [N, OC, 1, 1].
        """
        value = self.conv_v_right(x)
        n, c, h, w = value.size()
        value = value.view(n, c, h * w)                          # [N, IC, H*W]

        # One score per pixel, softmax-normalized over the H*W axis.
        query = self.conv_q_right(x).view(n, 1, h * w)           # [N, 1, H*W]
        query = self.softmax_right(query)

        # Weighted sum of the value features over all pixels.
        pooled = torch.matmul(value, query.transpose(1, 2))      # [N, IC, 1]
        pooled = pooled.unsqueeze(-1)                            # [N, IC, 1, 1]

        gate = self.sigmoid(self.conv_up(pooled))                # [N, OC, 1, 1]
        return x * gate

    def channel_pool(self, x):
        """Gate the pixels of ``x`` using a channel-pooled descriptor.

        Returns ``x`` multiplied by a gate of shape [N, 1, H, W]; the gate is
        a softmax over the H*W axis followed by a sigmoid.
        """
        query = self.conv_q_left(x)                              # [N, IC, H, W]
        n, c, h, w = query.size()

        # Global average pooling collapses the query to one value per channel.
        pooled = self.avg_pool(query)                            # [N, IC, 1, 1]
        n, c, pooled_h, pooled_w = pooled.size()
        pooled = pooled.view(n, c, pooled_h * pooled_w).permute(0, 2, 1)  # [N, 1, IC]

        value = self.conv_v_left(x).view(n, self.inter_planes, h * w)     # [N, IC, H*W]

        scores = torch.matmul(pooled, value)                     # [N, 1, H*W]
        scores = self.softmax_left(scores)

        gate = self.sigmoid(scores.view(n, 1, h, w))             # [N, 1, H, W]
        return x * gate

    def forward(self, x):
        """Return the sum of the channel-gated and pixel-gated input."""
        channel_gated = self.spatial_pool(x)     # [N, C, H, W]
        pixel_gated = self.channel_pool(x)       # [N, C, H, W]
        return pixel_gated + channel_gated
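# PSA_s: sequential variant of the PSA block -- forward() applies spatial_pool
# (per-channel gate) first and then channel_pool (per-pixel gate) on its result.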
class PSA_s(nn.Module):
    """Polarized self-attention block with the branches applied in sequence.

    ``forward`` first gates the channels of the input (``spatial_pool``) and
    then gates the pixels of that result (``channel_pool``).

    Args:
        inplanes: Number of channels of the input tensor.
        planes: Number of channels of the per-channel gate; the intermediate
            width is ``planes // 2``. The gate is multiplied with the input,
            so ``planes`` has to broadcast against ``inplanes``.
        kernel_size: Stored and used only to derive ``self.padding``; every
            convolution in the block is 1x1.
        stride: Stride of the four query/value 1x1 convolutions.
            NOTE(review): with ``stride > 1`` the per-pixel gate would be
            spatially smaller than the input; the block looks written for
            ``stride=1`` -- confirm.
    """

    def __init__(self, inplanes, planes, kernel_size=1, stride=1):
        super().__init__()
        self.inplanes = inplanes              # input channels
        self.inter_planes = planes // 2       # intermediate channels
        self.planes = planes                  # channels of the channel gate
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        reduction = 4
        bottleneck = self.inter_planes // reduction

        def pointwise(out_channels):
            # Bias-free 1x1 projection of the block input.
            return nn.Conv2d(self.inplanes, out_channels, kernel_size=1,
                             stride=stride, padding=0, bias=False)

        # Sub-modules are created in a fixed order so that parameter
        # registration (and random initialization) order stays stable.
        # 1x1 conv that collapses the input to a single channel.
        self.conv_q_right = pointwise(1)
        # 1x1 conv that maps the input to the intermediate width.
        self.conv_v_right = pointwise(self.inter_planes)
        # Two 1x1 convs: squeeze to inter_planes // 4, normalize and
        # activate, then expand to the output width.
        self.conv_up = nn.Sequential(
            nn.Conv2d(self.inter_planes, bottleneck, kernel_size=1),
            nn.LayerNorm([bottleneck, 1, 1]),
            nn.ReLU(inplace=True),
            nn.Conv2d(bottleneck, self.planes, kernel_size=1),
        )
        # dim=2 is the flattened H*W axis of the [N, 1, H*W] score tensor.
        self.softmax_right = nn.Softmax(dim=2)
        self.sigmoid = nn.Sigmoid()

        self.conv_q_left = pointwise(self.inter_planes)  # g
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv_v_left = pointwise(self.inter_planes)  # theta
        self.softmax_left = nn.Softmax(dim=2)

        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-initialize (fan_in) the four query/value convolutions.

        The layers inside ``conv_up`` are not touched here and keep the
        initialization they received at construction.
        """
        qv_convs = (self.conv_q_right, self.conv_v_right,
                    self.conv_q_left, self.conv_v_left)
        for conv in qv_convs:
            kaiming_init(conv, mode='fan_in')
        for conv in qv_convs:
            conv.inited = True

    def spatial_pool(self, x):
        """Pool over spatial positions to build a per-channel gate for ``x``.

        Returns ``x`` multiplied by a sigmoid gate of shape [N, OC, 1, 1].
        """
        value = self.conv_v_right(x)
        n, c, h, w = value.size()
        value = value.view(n, c, h * w)                          # [N, IC, H*W]

        # Single-channel score map, softmax-normalized over the H*W axis.
        query = self.conv_q_right(x).view(n, 1, h * w)           # [N, 1, H*W]
        query = self.softmax_right(query)

        # Weighted sum of the value features over all pixels.
        pooled = torch.matmul(value, query.transpose(1, 2))      # [N, IC, 1]
        pooled = pooled.unsqueeze(-1)                            # [N, IC, 1, 1]

        # Bottleneck (conv -> LayerNorm -> ReLU -> conv), then sigmoid.
        gate = self.sigmoid(self.conv_up(pooled))                # [N, OC, 1, 1]
        return x * gate

    def channel_pool(self, x):
        """Pool over channels to build a per-pixel gate for ``x``.

        Returns ``x`` multiplied by a sigmoid gate of shape [N, 1, H, W].
        """
        query = self.conv_q_left(x)                              # [N, IC, H, W]
        n, c, h, w = query.size()

        # Global average pooling collapses the query to one value per channel.
        pooled = self.avg_pool(query)                            # [N, IC, 1, 1]
        n, c, pooled_h, pooled_w = pooled.size()
        pooled = pooled.view(n, c, pooled_h * pooled_w).permute(0, 2, 1)  # [N, 1, IC]

        # Value features are softmax-normalized over H*W, channel by channel.
        value = self.conv_v_left(x).view(n, self.inter_planes, h * w)     # [N, IC, H*W]
        value = self.softmax_left(value)

        scores = torch.matmul(pooled, value)                     # [N, 1, H*W]
        gate = self.sigmoid(scores.view(n, 1, h, w))             # [N, 1, H, W]
        return x * gate

    def forward(self, x):
        """Apply the channel gate, then the pixel gate, and return the result."""
        channel_gated = self.spatial_pool(x)     # [N, C, H, W]
        return self.channel_pool(channel_gated)  # [N, C, H, W]
然后把PSA放在一个基本的BasicBlock(残差模块)中间即可:
class BasicBlock(nn.Module):
    """Residual block with a ``PSA_s`` attention module between its two convs.

    Layout: conv1 -> bn1 -> relu -> PSA_s -> conv2 -> bn2 -> (+ shortcut) -> relu.

    ``conv3x3``, ``BatchNorm2d`` and ``BN_MOMENTUM`` are not defined in this
    block; presumably they come from the surrounding backbone file -- verify.

    Args:
        inplanes: Passed to ``conv3x3`` as the first conv's input width.
        planes: Width used by both convs, both norm layers and the PSA module.
        stride: Passed to the first ``conv3x3`` call and stored on the block.
        downsample: Optional callable applied to the block input to produce
            the shortcut; when ``None`` the input itself is the shortcut.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        # Attention sits after the first conv/bn/relu stage.
        self.deattn = PSA_s(planes, planes)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the main path, add the shortcut, and apply the final ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(self.deattn(out)))

        # The shortcut is computed after the main path, from the raw input.
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(out + shortcut)