Contents:
1. CRNN-based image sequence prediction in PyTorch: loading your own dataset
2. CRNN-based image sequence prediction in PyTorch: model description
3. CRNN-based image sequence prediction in PyTorch: training process and common errors
Taking VGG_LSTM as an example, with the Adam optimizer and CrossEntropyLoss() as the loss function, the detailed training code is as follows:
if __name__ == "__main__":
    # Train VGG_LSTM with Adam and cross-entropy for 100 epochs, printing the
    # per-sample loss and accuracy on the training and test sets every epoch.
    model = VGG_LSTM()
    print(model)
    # Use the GPU when one is present, otherwise keep model and batches on
    # the CPU (the batches used to be moved with .cuda() unconditionally).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        # The evaluation pass below calls model.eval(); switch back to
        # training mode so dropout is active again from the second epoch on.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by len(train_data) gives a per-sample loss.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(
            train_loss / len(train_data), train_acc / len(train_data)))
        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # no_grad() replaces the removed `volatile=True` flag: no autograd
        # graph is built, which saves memory during evaluation.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(
            eval_loss / len(test_data), eval_acc / len(test_data)))
The complete code for the whole process, including data loading, the model definition, and training, is as follows:
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torch.optim as optim
# Number of frame sequences per mini-batch and the Adam step size.
BATCH_SIZE = 4
learning_rate = 0.0001

# Training-time preprocessing: a random scale/aspect crop resized to 224x224,
# then conversion to a float tensor in [0, 1].
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    # transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # transforms.Normalize((.5, .5, .5), (.5, .5, .5))
])

# Evaluation-time preprocessing must be deterministic, otherwise the reported
# test loss/accuracy changes from run to run. Use a fixed resize followed by
# a centre crop instead of the random crop used for training.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # transforms.Normalize((.5, .5, .5), (.5, .5, .5))
])
def default_loader(path):
    """Load the image stored at *path* and return it as an RGB PIL image."""
    # Image.open() is lazy and keeps the file open; convert() decodes the
    # pixels into a new image, so the file can be closed as soon as it returns.
    with Image.open(path) as img:
        return img.convert('RGB')
class MyDataset(Dataset):
    """Dataset of fixed-length frame sequences described by a list file.

    Every non-blank line of the list file is ``<image path> <integer label>``.
    Entries are sorted by path. A sample consists of the
    ``num_samples_per_iteration`` (9) consecutive entries that precede a
    randomly drawn position, and is labelled with the label stored at that
    position.

    Args:
        txt: Path of the list file.
        transform: Optional callable applied to every loaded frame.
        target_transform: Optional callable applied to the sample label.
        loader: Callable that maps an image path to an image object.
    """

    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        imgs = []
        # The context manager closes the list file even if a line is malformed.
        with open(txt, 'r') as fh:
            for line in fh:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                imgs.append((words[0], int(words[1])))
        # Sorting by path puts the frames into sequence order.
        imgs.sort(key=lambda x: x[0], reverse=False)
        self.num_samples = len(imgs)
        self.num_samples_per_iteration = 9
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Return ``(frames, label)`` for a randomly positioned window.

        NOTE: ``index`` is ignored; the window position is drawn at random on
        every call, so repeated calls with the same index differ.

        Raises:
            ValueError: If the list holds too few entries to form one window.
        """
        if self.num_samples <= self.num_samples_per_iteration:
            raise ValueError(
                'need more than {} entries to build a sequence, got {}'.format(
                    self.num_samples_per_iteration, self.num_samples))
        current_index = np.random.choice(range(self.num_samples_per_iteration, self.num_samples))
        current_label = self.imgs[current_index][1]
        if self.target_transform is not None:
            current_label = self.target_transform(current_label)
        current_imgs = []
        for i in range(current_index - self.num_samples_per_iteration, current_index):
            fn, _ = self.imgs[i]
            img = self.loader(fn)
            if self.transform is not None:
                img = self.transform(img)
            current_imgs.append(img)
        # Stacked along a new leading axis, e.g. [9, 3, 224, 224] when the
        # transform yields 3x224x224 tensors.
        batch_cur_imgs = np.stack(current_imgs, axis=0)
        return batch_cur_imgs, current_label

    def __len__(self):
        """Return the number of entries in the list file."""
        return len(self.imgs)
# Build both datasets from their list files, then wrap each one in a
# DataLoader that iterates in order (no shuffling) with BATCH_SIZE samples.
train_data = MyDataset(transform=train_transforms, txt='trainset256.txt')
test_data = MyDataset(transform=val_transforms, txt='testset256.txt')

train_loader = DataLoader(dataset=train_data, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=BATCH_SIZE)

# Report how many list entries each split contains.
print('num_of_trainData:', len(train_data))
print('num_of_testData:', len(test_data))
class VGG_LSTM(nn.Module):
    """Per-frame VGG16 features fed to an LSTM, followed by a classifier.

    Args:
        lstm_hidden_size: Hidden size of the LSTM.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        num_classes: Number of output classes (defaults to the original 3).
    """

    def __init__(self, lstm_hidden_size=256, num_lstm_layers=1, bidirectional=True,
                 num_classes=3):
        super().__init__()
        net = models.vgg16(pretrained=True)
        # Drop the fully connected head; only the convolutional features are used.
        net.classifier = nn.Sequential()
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size
        self.features = net
        self.lstm1 = nn.LSTM(input_size=512 * 7 * 7,
                             hidden_size=lstm_hidden_size,
                             num_layers=num_lstm_layers,
                             batch_first=True,
                             # LSTM dropout acts only between stacked layers;
                             # with a single layer it is a no-op that warns.
                             dropout=0.5 if num_lstm_layers > 1 else 0.0,
                             bidirectional=bidirectional)
        self.linear1 = nn.Sequential(
            nn.Linear(lstm_hidden_size * self.num_directions * num_lstm_layers, 64),
            nn.ReLU(inplace=True))
        self.output_layer = nn.Linear(64, num_classes)

    def init_hidden(self, x):
        """Return zero ``(h, c)`` states sized for the batch in *x*.

        The states are created with the dtype and device of *x*, so the model
        works on both CPU and GPU.
        """
        batch_size = x.size(0)
        shape = (self.num_directions * self.num_lstm_layers, batch_size, self.lstm_hidden_size)
        return x.new_zeros(shape), x.new_zeros(shape)

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: Tensor of shape [B, T, C, H, W] (originally [B, 9, 3, 224, 224]).

        Returns:
            Tensor of shape [B, num_classes] holding the class scores.
        """
        B, T = x.size(0), x.size(1)
        # Fold the time axis into the batch axis so every frame is encoded
        # independently by the feature extractor.
        frames = x.reshape(B * T, *x.shape[2:])
        feats = self.features(frames)
        # Back to one feature vector per time step: [B, T, F]. F is expected
        # to equal the LSTM input_size (512*7*7 for 224x224 frames).
        feats = feats.reshape(B, T, -1)
        h, c = self.init_hidden(feats)
        _, (h, c) = self.lstm1(feats, (h, c))  # h: [layers * directions, B, hidden]
        # Concatenate the final hidden state of every layer/direction per sample.
        h = h.transpose(0, 1).contiguous().view(B, -1)
        output = self.linear1(h)  # [B, 64]
        output = self.output_layer(output)  # [B, num_classes]
        return output
if __name__ == "__main__":
    # Train VGG_LSTM with Adam and cross-entropy for 100 epochs, printing the
    # per-sample loss and accuracy on the training and test sets every epoch.
    model = VGG_LSTM()
    print(model)
    # Use the GPU when one is present, otherwise keep model and batches on
    # the CPU (the batches used to be moved with .cuda() unconditionally).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        # The evaluation pass below calls model.eval(); switch back to
        # training mode so dropout is active again from the second epoch on.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by len(train_data) gives a per-sample loss.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(
            train_loss / len(train_data), train_acc / len(train_data)))
        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # no_grad() replaces the removed `volatile=True` flag: no autograd
        # graph is built, which saves memory during evaluation.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(
            eval_loss / len(test_data), eval_acc / len(test_data)))
Common mistakes:
1. Error: size mismatch, as shown below:
Solution: This error means that the tensor dimensions do not match between layers (for example, between the convolutional part and the layer that follows it). Work out the input and output size of every step; you can debug by printing the size at each step. For the LSTM input and output formats, refer to the official PyTorch documentation.
2. Error: out of memory, as shown below:
Solution:
First, reduce the batch_size;
Second, make the input images smaller, for example with resize;
Third, use a graphics card with more memory.
Also note that a larger convolutional part is not always better: in my task, using the convolutional part transferred from VGG or ResNet decreased the accuracy. So the network structure needs to be adjusted to the specific task.