CRNN-based image sequence prediction in PyTorch: the training process and common errors

Contents :
1. CRNN-based image sequence prediction in PyTorch: loading your own dataset
2. CRNN-based image sequence prediction in PyTorch: the model
3. CRNN-based image sequence prediction in PyTorch: the training process and common errors

This example uses the VGG_LSTM model, the Adam optimizer, and CrossEntropyLoss() as the loss function. The detailed training code is as follows:

if __name__ == "__main__":
    # Run on the GPU when one is available, otherwise fall back to the CPU,
    # instead of calling .cuda() unconditionally.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VGG_LSTM().to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))

        # -----------------------training----------------------------------
        # model.eval() is called at the end of every epoch, so training mode
        # (dropout etc.) has to be switched back on explicitly.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # CrossEntropyLoss averages over the batch by default; weight the
            # value by the batch size so that the division by the number of
            # samples below yields a per-sample mean.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
            train_data)), train_acc / (len(train_data))))

        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # No gradients are needed for evaluation; torch.no_grad() replaces the
        # removed Variable(..., volatile=True) flag.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
            test_data)), eval_acc / (len(test_data))))

The complete code, including data loading, the model definition, and training, is as follows:

import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torch.optim as optim

# Number of image sequences per mini-batch and the Adam learning rate.
BATCH_SIZE = 4
learning_rate = 0.0001

# Training-time preprocessing: a random crop rescaled to 224x224, then
# conversion to a float tensor.
# NOTE(review): no Normalize step is applied although the model uses
# pretrained VGG16 weights — confirm whether that is intended.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
])

# Evaluation-time preprocessing must be deterministic so that reported test
# metrics are comparable between epochs: resize, then take the centre crop
# (a random crop here would make every evaluation pass see different pixels).
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

def default_loader(path):
    """Load the image stored at ``path`` and return it converted to RGB."""
    # convert() returns a new, fully loaded image, so the file opened by
    # Image.open() can be released as soon as the with-block exits.
    with Image.open(path) as img:
        return img.convert('RGB')


class MyDataset(Dataset):
    """Dataset of fixed-length image windows read from an annotation file.

    Every non-empty line of ``txt`` holds an image path followed by an
    integer label, separated by whitespace. The entries are sorted by path.
    A sample consists of the ``num_samples_per_iteration`` (9) consecutive
    entries that precede a randomly drawn position, together with the label
    of the entry at that position.

    Args:
        txt: Path of the annotation file.
        transform: Optional callable applied to each loaded image separately.
        target_transform: Optional callable applied to the returned label.
        loader: Callable that loads one image from its path.
    """

    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        imgs = []
        # The context manager closes the annotation file once it is parsed.
        with open(txt, 'r') as fh:
            for line in fh:
                words = line.split()
                if not words:
                    # Skip blank lines (e.g. a trailing newline at the end).
                    continue
                imgs.append((words[0], int(words[1])))
        # NOTE(review): the sort is lexicographic on the path string, so the
        # temporal order is only right if file names sort that way — confirm.
        imgs.sort(key=lambda x: x[0], reverse=False)
        self.num_samples = len(imgs)
        self.num_samples_per_iteration = 9
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Return ``(images, label)`` for a randomly drawn window.

        ``index`` is ignored; the position is drawn uniformly from
        ``[num_samples_per_iteration, num_samples)`` on every call.

        Returns:
            A tuple of the stacked window images (stacked along a new
            leading axis with ``np.stack``) and the label of the entry that
            directly follows the window.

        Raises:
            ValueError: If the dataset has too few entries to form a window
                plus its target entry.
        """
        window = self.num_samples_per_iteration
        if self.num_samples <= window:
            raise ValueError(
                'need more than {} samples to build a sequence, got {}'.format(
                    window, self.num_samples))
        # Uniform draw over the same range as before, without materialising
        # the whole index range on every call.
        current_index = np.random.randint(window, self.num_samples)
        current_label = self.imgs[current_index][1]
        current_imgs = []
        for fn, _ in self.imgs[current_index - window:current_index]:
            img = self.loader(fn)
            if self.transform is not None:
                img = self.transform(img)
            current_imgs.append(img)
        if self.target_transform is not None:
            current_label = self.target_transform(current_label)
        # Shape is [window, ...] where the trailing dims are whatever the
        # transform produces for a single image.
        batch_cur_imgs = np.stack(current_imgs, axis=0)
        return batch_cur_imgs, current_label

    def __len__(self):
        """Return the number of entries read from the annotation file."""
        return len(self.imgs)


# Training set and loader, built from the annotation file 'trainset256.txt'.
# shuffle=False only fixes the order of the indices handed to the dataset;
# MyDataset.__getitem__ draws its own random position for every item.
train_data = MyDataset(txt='trainset256.txt', transform=train_transforms)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)

# Test set and loader, built from the annotation file 'testset256.txt'.
test_data = MyDataset(txt='testset256.txt', transform=val_transforms)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)
# Report how many annotation entries each dataset holds.
print('num_of_trainData:', len(train_data))
print('num_of_testData:', len(test_data))


class VGG_LSTM(nn.Module):
    """VGG16 frame encoder followed by an LSTM sequence classifier.

    Each frame of the input sequence is encoded by a pretrained VGG16 whose
    classifier has been replaced by an empty ``nn.Sequential``. The per-frame
    features are fed to an LSTM, and the final hidden state of every
    layer/direction is mapped to 3 class scores by two linear layers.

    Args:
        lstm_hidden_size: Hidden size of the LSTM.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
    """

    def __init__(self, lstm_hidden_size=256, num_lstm_layers=1, bidirectional=True):
        super(VGG_LSTM, self).__init__()
        net = models.vgg16(pretrained=True)
        # Drop the fully connected head; the network then outputs the
        # flattened convolutional features.
        net.classifier = nn.Sequential()
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size
        # Expected to yield 512 * 7 * 7 features per 224x224 frame, which is
        # the input size the LSTM below is built for.
        self.features = net
        self.lstm1 = nn.LSTM(input_size=512 * 7 * 7,
                             hidden_size=lstm_hidden_size,
                             num_layers=num_lstm_layers,
                             batch_first=True,
                             # LSTM dropout acts between stacked layers only;
                             # with a single layer it has no effect and just
                             # triggers a warning, so it is disabled then.
                             dropout=0.5 if num_lstm_layers > 1 else 0.,
                             bidirectional=bidirectional)
        self.linear1 = nn.Sequential(nn.Linear(lstm_hidden_size * self.num_directions * num_lstm_layers, 64),
                                     nn.ReLU(inplace=True))
        self.output_layer = nn.Linear(64, 3)

    def init_hidden(self, x):
        """Return zero-filled initial ``(h, c)`` states for the LSTM.

        The states are created with the dtype and device of ``x`` and have
        shape ``[num_layers * num_directions, x.size(0), lstm_hidden_size]``,
        so the model works on CPU as well as on GPU.
        """
        shape = (self.num_directions * self.num_lstm_layers, x.size(0), self.lstm_hidden_size)
        return x.new_zeros(shape), x.new_zeros(shape)

    def forward(self, x):
        """Classify a batch of image sequences.

        Args:
            x: Tensor of shape ``[B, T, C, H, W]`` (the training script uses
                ``T = 9`` frames of ``3 x 224 x 224``).

        Returns:
            Tensor of shape ``[B, 3]`` with the unnormalised class scores.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # Fold time into the batch dimension so every frame is encoded
        # independently by the CNN.
        frames = x.view(batch_size * seq_len, *x.size()[2:])
        feats = self.features(frames)
        # Frames of one sequence are consecutive rows, so a plain reshape
        # restores [B, T, features].
        feats = feats.reshape(batch_size, seq_len, -1)
        h0, c0 = self.init_hidden(feats)
        # h: [num_layers * num_directions, B, lstm_hidden_size]
        _, (h, _) = self.lstm1(feats, (h0, c0))
        # Out-of-place transpose: the LSTM output is not modified in place.
        h = h.transpose(0, 1).contiguous().view(batch_size, -1)  # [B, num_layers * num_directions * lstm_hidden_size]
        output = self.linear1(h)  # [B, 64]
        output = self.output_layer(output)  # [B, 3]
        return output


if __name__ == "__main__":
    # Run on the GPU when one is available, otherwise fall back to the CPU,
    # instead of calling .cuda() unconditionally.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VGG_LSTM().to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))

        # -----------------------training----------------------------------
        # model.eval() is called at the end of every epoch, so training mode
        # (dropout etc.) has to be switched back on explicitly.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # CrossEntropyLoss averages over the batch by default; weight the
            # value by the batch size so that the division by the number of
            # samples below yields a per-sample mean.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
            train_data)), train_acc / (len(train_data))))

        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # No gradients are needed for evaluation; torch.no_grad() replaces the
        # removed Variable(..., volatile=True) flag.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
            test_data)), eval_acc / (len(test_data))))

Common mistakes :
1. Error: size mismatch, as shown below:
[Insert picture description here]
Solution: This error means that the dimensions do not match between consecutive layers, for example between the convolutional layers and the layer that follows them. Work out the input and output size of every step; printing the tensor size after each step while debugging makes this easy. For the input and output formats of the LSTM, refer to the official PyTorch documentation.
Here Insert Picture DescriptionHere Insert Picture Description
2. Error: out of memory, as shown below:
Here Insert Picture Description
Solution :
First, reduce the value of batch_size;

Second, make the input images smaller, for example with resize;

Third, switch to a graphics card with more memory.

Also note that a deeper convolutional part is not always better: in my task, replacing the convolutional part with transferred VGG or ResNet weights decreased the accuracy. The network structure therefore needs to be adjusted to the specific task.

Guess you like

Origin blog.csdn.net/hnu_zzt/article/details/86519448