I recently switched from TensorFlow to PyTorch, and today I ran into a nasty pitfall: `RuntimeError: mat1 and mat2 shapes cannot be multiplied`. Most of the explanations on the Internet are vague and ambiguous. After carefully analyzing how the network structure is implemented, I finally understood this problem.
The data set used this time is CIFAR10, about 130M. There are 50,000 training images and 10,000 test images, each with a size of 32*32, and all images are divided into ten categories, which is very suitable as a data set for classification targets.
Article Directory
1. Import the CIFAR10 dataset
Note that I set batch_size to 1 here, and I will use it below
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch import nn,optim,device
from torch.utils.tensorboard import SummaryWriter
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so
# the later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-channel mean / std passed to Normalize.
# NOTE(review): these look like the commonly used ImageNet statistics rather
# than values computed from CIFAR10 -- confirm this is intended.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training and validation use the same preprocessing:
# convert to tensor, then normalize.
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# The dataset is read from a local directory (download=False).
data_root = r'D:\AAA\PYTHON\pythonproject\jupyter\data\CIFAR10'
train_dataset = torchvision.datasets.CIFAR10(
    root=data_root, train=True, transform=train_transform, download=False)
valid_dataset = torchvision.datasets.CIFAR10(
    root=data_root, train=False, transform=valid_transform, download=False)

# batch_size is deliberately 1 here; the shape analysis further down
# relies on this value.
batch_size = 1
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# Validation only aggregates metrics over the whole set, so shuffling
# is not needed.
valid_loader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

print('train_dataset', len(train_dataset))  # 50000
print('valid_dataset', len(valid_dataset))  # 10000
2. Understand the relationship between image size and VGG16 network structure
First of all, this error is raised when data is fed into the model object. The cause is that, when the data enters the fully connected layer, a pair of tensors have shapes that cannot be matrix-multiplied. To solve this problem, you must be clear about the input and output size of the image at every layer. Start from the two classic VGG16 diagrams.
Each row of column D in Figure 1 gives the convolution kernel size and the number of output channels of each layer of the VGG16 we want to use. For example, conv3-64 means that the convolution kernel size of this layer is 3 X 3 and the number of output channels is 64.
Figure 2 depicts the output size and number of channels after each layer of VGG16 for a 224 X 224 input. For example, 112 X 112 X 128 means that the output size of that layer is 112 X 112 and the number of output channels is 128.
The sizes for our 32 X 32 dataset are the ones annotated with red pen in Figure 2. Note that by the time the tensor reaches the average-pooling layer, its size has already been reduced to 1 X 1 by the preceding layers, so this pooling is equivalent to no pooling at all. That is why other bloggers may omit it.
In summary, we can build vgg16 manually:
from torch import nn
class vgg16_net(nn.Module):
    """VGG16 (configuration D) sized for 32 x 32 RGB inputs and 10 classes.

    `features` holds five convolutional blocks. Every 3 x 3 convolution
    uses padding 1 and stride 1, so it keeps the spatial size, and every
    block ends with a 2 x 2 max pool that halves it. A 32 x 32 input is
    therefore 1 x 1 x 512 when it leaves `features`. `avgpool` forces a
    1 x 1 spatial output, and `classifier` maps 512 features to 10 logits.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layers
        self.features = nn.Sequential(
            # block1
            nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            # inplace=True like every other activation in this network
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
            # block2
            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
            # block3
            nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
            # block4
            nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
            # block5
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        )
        # Average pooling
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        # Fully connected layers
        self.classifier = nn.Sequential(
            nn.Linear(in_features=1 * 1 * 512, out_features=512, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4, inplace=False),
            nn.Linear(in_features=512, out_features=256, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.4, inplace=False),
            nn.Linear(in_features=256, out_features=10, bias=True),
        )

    def forward(self, input):
        """Run `input` through features, avgpool and classifier in order."""
        input = self.features(input)
        input = self.avgpool(input)
        # The flatten step below is intentionally left commented out at this
        # point of the article: without it the classifier receives a 4-D
        # tensor and raises the "mat1 and mat2 shapes cannot be multiplied"
        # error that the following section analyses. Uncomment it to fix
        # the model.
        # input=input.view(-1,512)
        input = self.classifier(input)
        return input
# Build the network and move it onto the configured device (the GPU),
# then print its layer structure.
model = vgg16_net().to(device)
print(model)
The model structure at this time is:
vgg16_net(
(features): Sequential(
(0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(6): ReLU(inplace=True)
(7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(8): ReLU(inplace=True)
(9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace=True)
(12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(13): ReLU(inplace=True)
(14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(15): ReLU(inplace=True)
(16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(18): ReLU(inplace=True)
(19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(20): ReLU(inplace=True)
(21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(22): ReLU(inplace=True)
(23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(25): ReLU(inplace=True)
(26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(27): ReLU(inplace=True)
(28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(29): ReLU(inplace=True)
(30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
(classifier): Sequential(
(0): Linear(in_features=512, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.4, inplace=False)
(3): Linear(in_features=512, out_features=256, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.4, inplace=False)
(6): Linear(in_features=256, out_features=10, bias=True)
)
)
The ultimate detailed explanation: RuntimeError: mat1 and mat2 shapes cannot be multiplied problem
Add a small test section to see
from torch import ones
# Dummy all-ones batch with the CIFAR10 input shape (batch_size, 3, 32, 32),
# created and then moved onto the configured device (the GPU).
input = ones((batch_size, 3, 32, 32)).to(device)
# Single forward pass through the model.
output = model(input)
Error:
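After trying a lot of data, I finally worked out how the two shapes in the error message, (512 X 1 and 512 X 512), are composed. They are equivalent to:
left of "and" = (number of output channels of the last convolutional layer * batch_size * pooling size X pooling size), here 512*1*1 X 1
right of "and" = (input size of the first fully connected layer X output size of the first fully connected layer), here 512 X 512
where
batch_size is the user-defined batch_size passed to DataLoader when loading the dataset; my value is 1
input size of the first fully connected layer = pooling size * pooling size * number of output channels of the last convolutional layer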
If you take a closer look, you will find that for the shapes on the two sides of "and" to be multipliable, the number of columns on the left must equal the number of rows on the right, i.e. at least pooling size == input size of the first fully connected layer must hold, yet these two values have no logical relationship. Looking closer still, only batch_size is user-defined; all the other values are fixed by the network structure and can be regarded as constants. So we only need to regroup the factors on the left side of "and": put batch_size on the left of the X, and put pooling size * pooling size * number of output channels of the last convolutional layer on the right of the X. The columns of the left matrix and the rows of the right matrix are then the same constant, so the multiplication always works. At this point:
left of "and" = (batch_size X pooling size * pooling size * number of output channels of the last convolutional layer)
right of "and" = (pooling size * pooling size * number of output channels of the last convolutional layer X output size of the first fully connected layer)
solve the problem
In the code, in the forward method, after the avgpool average-pooling module and before the classifier fully connected module, add:
input=input.view(-1,512)
In my code, just cancel that comment.
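A closer look at the view function: -1 tells PyTorch to infer that dimension automatically. From the analysis above we know the inferred value is batch_size, and 512 is a constant of the network model, so for full batches it is equivalent to write the following (keeping -1 is safer, though, because the last batch of an epoch can be smaller than batch_size):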
input=input.view(batch_size,512)
3. Training process
The training process has not been optimized yet, but the overall framework looks roughly like this:
import time
import torch
# Loss function
loss_fn = nn.CrossEntropyLoss()
# Optimizer
lr = 0.01
# The learning rate is updated once every `step_size` epochs
step_size = 2
# momentum (float): momentum factor
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.8, weight_decay=0.001)
schedule = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.5, last_epoch=-1)
train_step = 0
vali_step = 0
# One writer per log directory, created once and reused for every epoch
# instead of being re-opened on each epoch.
# View the curves with: tensorboard --logdir=logs
writers = {
    'ave_train_acc': SummaryWriter('./logs/ave_train_acc'),
    'ave_vali_acc': SummaryWriter('./logs/ave_vali_acc'),
    'train_loss': SummaryWriter('./logs/train_loss'),
    'ave_vali_loss': SummaryWriter('./logs/ave_vali_loss'),
}
epoch = 5
try:
    for i in range(1, epoch + 1):
        starttime = time.time()
        train_correct = 0
        # ----------------------------- training ------------------------------
        # One epoch is a full pass over the training set, i.e.
        # len(train_loader) steps of up to batch_size images each.
        model.train()  # enable Dropout for training
        for img, tar in train_loader:
            img = img.to(device)
            tar = tar.to(device)
            outputs = model(img)
            train_loss = loss_fn(outputs, tar)
            # Count correct predictions; kept as a tensor so no host sync
            # is forced on every step.
            train_correct += (outputs.argmax(1) == tar).sum()
            # Optimizer updates the model
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            train_step += 1
            if train_step % 5000 == 0:
                print("train_step", train_step)
        vali_loss = 0
        vali_correct = 0
        # ----------------------------- validation ------------------------------
        model.eval()  # disable Dropout so validation metrics are deterministic
        with torch.no_grad():
            for img, tar in valid_loader:
                img = img.to(device)
                tar = tar.to(device)
                outputs = model(img)
                # loss_fn returns the batch mean; weight it by the batch size
                # so a smaller final batch does not skew the average.
                vali_loss += loss_fn(outputs, tar) * tar.size(0)
                vali_correct += (outputs.argmax(1) == tar).sum()
                vali_step += 1
                if vali_step % 2000 == 0:
                    print("vali_step", vali_step)
        # Advance the StepLR schedule once per epoch; without this call the
        # learning rate is never decayed.
        schedule.step()
        endtime = time.time()
        spendtime = endtime - starttime
        # Convert the accumulated tensors to plain Python floats once per epoch.
        ave_train_acc = float(train_correct) / len(train_loader.dataset)
        ave_vali_loss = float(vali_loss) / len(valid_loader.dataset)
        ave_vali_acc = float(vali_correct) / len(valid_loader.dataset)
        # train_loss is the loss of the last training batch of this epoch,
        # not an epoch average.
        train_loss = train_loss.item()
        print("Epoch {}/{} : train_step={}, vali_step={}, spendtime={}s".format(i, epoch, train_step, vali_step, spendtime))
        print("ave_train_acc={}, ave_vali_acc={}".format(ave_train_acc, ave_vali_acc))
        print("train_loss={}, ave_vali_loss={} \n".format(train_loss, ave_vali_loss))
        writers['ave_train_acc'].add_scalar('Acc', ave_train_acc, i)
        writers['ave_vali_acc'].add_scalar('Acc', ave_vali_acc, i)
        writers['train_loss'].add_scalar('Loss', train_loss, i)
        writers['ave_vali_loss'].add_scalar('Loss', ave_vali_loss, i)
finally:
    # Close every writer even if training is interrupted.
    for writer in writers.values():
        writer.close()
training result
The training results are still very good, and will be updated after optimization