Notes on PyTorch Distributed Training Pitfalls

I have been digging into PyTorch distributed training for the past few days, and only today did I get an MNIST training run working end to end. Single-machine multi-GPU (and single-GPU) setups may only need a few lines of code changed, but multi-machine multi-GPU has plenty of pitfalls, so here is a brief record.
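
For context, the single-machine multi-GPU case really can be that small a change. The sketch below uses the built-in nn.DataParallel with a stand-in linear model (it assumes at least one CUDA GPU is visible and is not part of the MNIST script later in this post):

import torch
import torch.nn as nn

model = nn.Linear(28 * 28, 10)            # stand-in for a real model
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)        # the only extra line versus single-GPU code
model = model.cuda()

x = torch.randn(8, 28 * 28).cuda()        # a dummy batch; DataParallel splits it across the GPUs
out = model(x)                            # the rest of the training loop stays unchanged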

References

I went through a lot of material over these days and found many blog posts unhelpful: some were copy-pasted with the images missing, others simply made no sense. Below are the few posts that I actually found useful:
A Concise Tutorial on PyTorch Distributed Training
PyTorch Distributed Training
PyTorch multi-node distributed training
Pitfalls of PyTorch multi-GPU training with DistributedDataParallel
PyTorch Parallel and Distributed (Part 2): the torch.distributed communication package
PyTorch multi-GPU training: single-machine multi-GPU and multi-machine multi-GPU
IndexError: invalid index of a 0-dim tensor. Use tensor.item() to convert a 0-dim tensor to a Python
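
The last entry is about an API change that trips up older tutorials: indexing a 0-dim tensor (the old loss.data[0] pattern) now raises that IndexError, and .item() is the fix, which is why the training loop below prints loss.item(). A tiny illustration:

import torch

loss = torch.tensor(0.6931)   # a 0-dim tensor, like the loss returned by a criterion
# loss.data[0]                # old pattern: raises "IndexError: invalid index of a 0-dim tensor"
print(loss.item())            # the fix: .item() converts it to a plain Python float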

Code

import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
# apex is never used below (the model is wrapped with the built-in
# torch.nn.parallel.DistributedDataParallel), so these imports are optional:
# from apex.parallel import DistributedDataParallel as DDP
# from apex import amp

class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=2,
                        type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=2, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=16, type=int, 
                        metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--init-method', type=str, default='tcp://23.23.23.23:3348')  # master node address

    args = parser.parse_args()
    #########################################################
    args.world_size = args.gpus * args.nodes                #
    #os.environ['MASTER_ADDR'] = 'xx.xx.xx.xx'              #
    #os.environ['MASTER_PORT'] = '4488'                     #
    mp.spawn(train, nprocs=args.gpus, args=(args,))         #
    #########################################################

def train(gpu, args):
    ############################################################
    # global rank of this process = node index * GPUs per node + local GPU index
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method=args.init_method,
        world_size=args.world_size,
        rank=rank
    )
    ############################################################
    
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    
    ###############################################################
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[gpu])
    ###############################################################

    DOWNLOAD_MNIST = False
    # print("Start!")    
    if not (os.path.exists('./mnist/')) or not os.listdir('./mnist/'):
        DOWNLOAD_MNIST = True
    # Data loading code
    kwargs = {'num_workers': 4, 'pin_memory': True}
    train_dataset = torchvision.datasets.MNIST(
        root='./mnist/',
        train=True,
        transform=transforms.ToTensor(),
        download=DOWNLOAD_MNIST,
    )                             
    # print(DOWNLOAD_MNIST,"ok!")                  
    ################################################################
    # each process only sees its own shard of the training set
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )
    ################################################################

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,             # shuffling is handled by the DistributedSampler
        sampler=train_sampler,
        **kwargs)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        train_sampler.set_epoch(epoch)   # reshuffle each process's shard differently every epoch
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1, 
                    args.epochs, 
                    i + 1, 
                    total_step,
                    loss.item())
                   )
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))

    test_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST('./mnist/', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():   # no gradients are needed during evaluation
        for data, target in test_loader:
            data, target = data.cuda(), target.cuda()
            output = model(data)
            # the model outputs raw logits, so sum up the batch loss with cross_entropy,
            # matching the CrossEntropyLoss used during training
            test_loss += torch.nn.functional.cross_entropy(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1]   # index of the max logit per sample
            correct += pred.eq(target.view_as(pred)).cpu().sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


if __name__ == '__main__':
    main()
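
For reference, this is roughly how the script is launched on the two machines. The file name train_dist.py is just a placeholder for wherever you save the code above, and the --init-method address must be the master node's IP with a port reachable from both machines:

# on the master node (node rank 0), whose IP appears in --init-method
python train_dist.py -n 2 -g 2 -nr 0 --init-method tcp://23.23.23.23:3348

# on the second node (node rank 1), using exactly the same address and port
python train_dist.py -n 2 -g 2 -nr 1 --init-method tcp://23.23.23.23:3348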

Results

(Master node: 2 × V100; second node: 2 × 2080Ti)

Multi-machine multi-GPU distributed training

[training log screenshot]

Single-machine multi-GPU training

[training log screenshot]

Comparison

The results above show that distributed training really is faster than a single machine: the number of steps per epoch is roughly halved and training finished in 47 s. Accuracy, however, was only 87% versus 94% on a single machine, and convergence was slower as well: after 16 epochs the distributed loss was 0.70 compared with 0.30 for the single-machine run. To reach the same accuracy, distributed training therefore needs more iterations.

The reason I am looking into this at all is a competition with tight time limits: even with two fully loaded V100s on a single machine, two training epochs take 24 hours, so distributed training seemed worth a try. Now that the MNIST example runs, the next step is to move the competition code over to it. The accuracy drop is a little worrying, but I will get it working first and see how it goes.
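
One plausible reason for the gap: if each of the four GPUs loads batches of 100 while the learning rate stays at 1e-4, every optimizer step sees a larger effective batch than in the 2-GPU single-machine run, so each epoch makes fewer, equally small updates. A common heuristic (which I have not yet verified on this setup) is to scale the learning rate with the number of processes, roughly like this sketch with a stand-in model:

import torch
import torch.nn as nn

base_lr = 1e-4
world_size = 4                        # args.gpus * args.nodes in the script above
scaled_lr = base_lr * world_size      # linear learning-rate scaling heuristic

model = nn.Linear(10, 10)             # stand-in model so the snippet runs on its own
optimizer = torch.optim.SGD(model.parameters(), lr=scaled_lr)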


Reposted from blog.csdn.net/Geek_/article/details/113663720