Debugging Deep Learning Models

1. A simple training example

https://zhuanlan.zhihu.com/p/136902153

```python
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Conv2d(3, 64, 3)
optimizer = optim.SGD(model.parameters(), lr=0.5)
# StepLR decays the learning rate by gamma (default 0.1) every 2 epochs
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2)

for i in range(5):
    optimizer.zero_grad()
    x = model(torch.randn(3, 3, 64, 64))
    loss = x.sum()
    loss.backward()
    print('{} optim: {}'.format(i, optimizer.param_groups[0]['lr']))
    optimizer.step()
    # get_last_lr() replaces the deprecated get_lr()
    print('{} scheduler: {}'.format(i, lr_scheduler.get_last_lr()[0]))
    # since PyTorch 1.1, lr_scheduler.step() is called after optimizer.step()
    lr_scheduler.step()
```

2. A second simple training example

```python
import torch
import torch.nn as nn
import torch.optim as optim
import yaml

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear = nn.Linear(2, 3)

    def forward(self, x):
        x = self.linear(x)
        return x

model = Net()
# load the training config (e.g. a WeNet u2++ conformer config)
filename = "exp/u2++_conformer/train.yaml"
with open(filename, 'r') as fin:
    configs = yaml.load(fin, Loader=yaml.FullLoader)
optimizer = optim.Adam(model.parameters(), **{'lr': 0.001})
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2)
```
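
The `configs` dict loaded from the YAML is never actually used above. In WeNet-style configs the optimizer hyperparameters usually live under an `optim_conf` key; assuming that layout (an assumption, not shown in the original), they would typically be unpacked like this:

```python
# hypothetical: assumes the YAML has a WeNet-style 'optim_conf' section
optim_conf = configs.get('optim_conf', {'lr': 0.001})
optimizer = optim.Adam(model.parameters(), **optim_conf)
```
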
  1. ESPnet model loading code

    import torch
    from espnet.nets.pytorch_backend.e2e_asr_transformer import E2E
    from espnet.asr.asr_utils import get_model_conf
    model_path = "exp/train_nodev_sp_pytorch_train/results/model.21-30.avg.best"
    # recover the input/output dims and training args saved next to the model
    idim, odim, args = get_model_conf(model_path, None)
    # map_location keeps tensors on CPU regardless of where they were saved
    model_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
    model = E2E(idim, odim, args)
    model.load_state_dict(model_state_dict)
    print(model)  # print(model) already calls model.__repr__()
    for name, parameters in model.named_parameters():
        print(name, ':', parameters.size())
    
  2. Inspecting the model structure

    print(model)                      # module hierarchy
    print(list(model.modules()))      # flat list of every submodule
    print(model.state_dict().keys())  # names of all parameters and buffers
    
  3. Inspecting model parameters

    for name, parameter in model.named_parameters():
        print(name, ':', parameter.size())
    
  4. Inspecting model gradients

    model.linear.weight.grad
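
`.grad` is `None` until a backward pass has populated it; a quick check, reusing `torch` and the `Net` class from the example above:

    net = Net()
    print(net.linear.weight.grad)        # None: no backward pass has run yet
    net(torch.randn(4, 2)).sum().backward()
    print(net.linear.weight.grad.shape)  # torch.Size([3, 2])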

  5. Saving and loading the optimizer state

    model = torch.nn.Linear(2, 3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    torch.save(optimizer.state_dict(), 'optimizer.pth')
    optimizer.load_state_dict(torch.load('optimizer.pth'))
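
For reference, an optimizer's `state_dict()` has two top-level keys: `state` (per-parameter buffers, e.g. Adam's moment estimates) and `param_groups` (hyperparameters such as `lr`):

    sd = optimizer.state_dict()
    print(sd.keys())           # dict_keys(['state', 'param_groups'])
    print(sd['param_groups'])  # contains 'lr', 'betas', 'weight_decay', ...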
    
    
  6. Gradient clipping

    clip = 5
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    # skip the update when the gradients have overflowed (norm is inf/nan)
    if torch.isfinite(grad_norm):
        optimizer.step()
Note: `clip_grad_norm_` treats all of the gradients together as one vector and rescales them so that this vector's norm is at most `clip`. (The default is the 2-norm: the square root of the sum of the squares of every element; there is no division by the element count.) The return value is the total norm of that vector before clipping, a tensor of shape `[]`, i.e. a scalar rather than a vector.

The weight update is performed only when this norm is finite; if the gradients have overflowed to inf (or NaN), `torch.isfinite` returns False and the update is skipped.
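
A quick check that the returned value really is the pre-clipping total 2-norm, as a 0-dim tensor:

    import torch
    p = torch.nn.Parameter(torch.ones(3))
    p.grad = torch.tensor([3.0, 4.0, 0.0])  # 2-norm is 5
    total_norm = torch.nn.utils.clip_grad_norm_([p], max_norm=1.0)
    print(total_norm, total_norm.shape)     # tensor(5.) torch.Size([])
    print(p.grad)                           # rescaled to norm ~1: tensor([0.6000, 0.8000, 0.0000])
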
  1. Pulling data off the GPU

    # convert a scalar tensor that lives on the GPU into a Python number
    best_score.cpu().item()
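
Two related conversions that come up constantly while debugging (a minimal sketch; `best_score` is assumed to be a 0-dim tensor on the GPU):

    value = best_score.item()                  # .item() also works directly on a GPU tensor
    array = best_score.detach().cpu().numpy()  # .numpy() requires a detached CPU tensor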

  2. Iterating over checkpoint keys and model parameters

    
    # assumes `checkpoint` is a loaded state dict and `model` a constructed module
    for key in checkpoint.keys():
        print(key)
    print("#######")
    for name, parameters in model.named_parameters():
        print(name)
    # compare stored vs. in-memory tensors for selected layers
    for name, parameters in model.named_parameters():
        if "depthwise_conv" in name:
            print("[wyr debug]")
            print(name)
            print(checkpoint[name].size())
            print(parameters.size())
            print(checkpoint[name])
            print(parameters)
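
When the goal is just to find where a checkpoint and a model disagree, `load_state_dict(strict=False)` reports the mismatch directly:

    # missing_keys: present in the model but absent from the checkpoint;
    # unexpected_keys: present in the checkpoint but absent from the model
    missing, unexpected = model.load_state_dict(checkpoint, strict=False)
    print('missing:', missing)
    print('unexpected:', unexpected)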
    
    
  3. Using TensorBoard

    tensorboard: the directory where the training TensorBoard logs are saved. To open them, run `tensorboard --logdir checkpoint/tensorboard/train/`; to view them, open `<training-server-ip>:6006` in a browser.
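
The logs that TensorBoard displays are written with `torch.utils.tensorboard.SummaryWriter`; a minimal sketch (the directory name here simply mirrors the path above):

    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter('checkpoint/tensorboard/train/')
    for step in range(100):
        writer.add_scalar('train/loss', 1.0 / (step + 1), step)
    writer.close()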
    
    
  4. Saving and loading models

WeNet model saving:


    logging.info('Checkpoint: save to checkpoint %s' % path)
    # unwrap (Distributed)DataParallel so the saved keys have no 'module.' prefix
    if isinstance(model, torch.nn.DataParallel):
        state_dict = model.module.state_dict()
    elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save(state_dict, path)

WeNet model loading:

    if torch.cuda.is_available():
        logging.info('Checkpoint: loading from checkpoint %s for GPU' % path)
        checkpoint = torch.load(path)
    else:
        logging.info('Checkpoint: loading from checkpoint %s for CPU' % path)
        checkpoint = torch.load(path, map_location='cpu')
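
After loading, the state dict is applied to an already-constructed model (the step that typically follows in WeNet's checkpoint utilities):

    model.load_state_dict(checkpoint)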

FunASR model saving:

    torch.save(
        {
            "model": model.state_dict(),
            "reporter": reporter.state_dict(),
            "optimizers": [o.state_dict() for o in optimizers],
            "schedulers": [
                s.state_dict() if s is not None else None
                for s in schedulers
            ],
            "scaler": scaler.state_dict() if scaler is not None else None,
        },
        output_dir / "checkpoint.pth",
    )

    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

FunASR model loading:

    states = torch.load(
        checkpoint,
        map_location=f"cuda:{torch.cuda.current_device()}" if ngpu > 0 else "cpu",
    )
    model.load_state_dict(states["model"])

    import torch
    model_name = "checkpoint/1epoch.pth"
    # CPU loading
    checkpoint = torch.load(model_name, map_location=torch.device('cpu'))
    # GPU loading (alternative)
    # checkpoint = torch.load(model_name)
    for key in checkpoint.keys():
        print(key)
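
A minimal sketch of restoring everything the FunASR-style `checkpoint.pth` above contains (assuming the same dict layout, with `model`, `reporter`, `optimizers`, `schedulers`, and `scaler` already constructed):

    states = torch.load("checkpoint.pth", map_location="cpu")
    model.load_state_dict(states["model"])
    reporter.load_state_dict(states["reporter"])
    for o, o_state in zip(optimizers, states["optimizers"]):
        o.load_state_dict(o_state)
    for s, s_state in zip(schedulers, states["schedulers"]):
        if s is not None and s_state is not None:
            s.load_state_dict(s_state)
    if scaler is not None and states["scaler"] is not None:
        scaler.load_state_dict(states["scaler"])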

Reposted from blog.csdn.net/weixin_43870390/article/details/131068689