代码地址:https://github.com/uzh-rpg/deep_uncertainty_estimation
已训练resnet18
python train.py --model_name resnet18
(注意:此时会报错,只要在train.py中加入
from models.resnet_dropout import ResNet18Dropout
from models_adf.resnet_adf import ResNet18ADF
两句import即可。)
训练结果保存在.\checkpoint\ckpt_resnet18.pth
测试
python eval.py -r -b --load_model_name resnet18 --test_model_name resnet18_dropout_adf
(此时产生疑问,resnet18的参数可以用到adf中吗?虽然已经阅读过很多次论文,但还是需要看懂adf的代码实现)
==> Preparing data…
Files already downloaded and verified
Files already downloaded and verified
==> Building model…
==> Resuming from checkpoint…
Loaded checkpoint at location ./checkpoint/ckpt_resnet18.pth
==> Loaded model statistics:
test_model_name = resnet18_dropout_adf
load_model_name = resnet18
@epoch = 345
best_acc = 95.07
==> Selected parameters:
use_mcdo = False
num_samples = 10
p = 0.2
min_variance = 0.0001
noise_variance = 0.0001
tau = 0.0001
==> Starting evaluation…
Accuracy = 94.97
Brier Score = 0.008060044976323844
Negative log-likelihood = 198185.5651
Time = 6.990173101425171
上面引用是eval的输出,发现自己没有保存train的输出,不能对比resnet18和resnet18_adf。所以想:
python eval.py -r -b --load_model_name resnet18 --test_model_name resnet18
发现不能,eval必须带adf。
论文回看
又回头看论文,妙啊!
该方法相当于没有改变任何“可学习的参数”,而是只改变了中间激活,因此可以直接用已有训练好的参数。
在网络参数具体传播的时候,通道数不会发生变化吗?网络结构不会变化吗?
adf代码实现
eval.py
在eval.py中,模型初始化如下:
# Model
# Build the network selected on the command line (--test_model_name).
if args.verbose:
    print('==> Building model...')

def model_loader():
    """Instantiate the architecture named by --test_model_name."""
    # CLI name -> constructor.
    constructors = {
        'resnet18': ResNet18,
        'resnet18_dropout': ResNet18Dropout,
        'resnet18_adf': ResNet18ADF,
        'resnet18_dropout_adf': ResNet18ADFDropout,
    }
    # CLI name -> positional constructor arguments taken from the parsed args.
    ctor_args = {
        'resnet18': [],
        'resnet18_dropout': [args.p],
        'resnet18_adf': [args.noise_variance, args.min_variance],
        'resnet18_dropout_adf': [args.p, args.noise_variance, args.min_variance],
    }
    key = args.test_model_name.lower()
    return constructors[key](*ctor_args[key])

net = model_loader().to(device)
即:
net = ResNet18ADFDropout(args.p, args.noise_variance, args.min_variance).to(device)
后续导入已训练好的模型参数:
# Restore the weights trained under --load_model_name into the architecture
# built above. Loading works across architectures as long as the state_dict
# keys (i.e. the learnable parameters) match.
if args.verbose:
    print('==> Resuming from checkpoint..')
assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
model_to_load = args.load_model_name.lower()
ckpt_path = './checkpoint/ckpt_{}.pth'.format(model_to_load)
checkpoint = torch.load(ckpt_path)
if args.verbose:
    print('Loaded checkpoint at location {}'.format(ckpt_path))
net.load_state_dict(checkpoint['net'])
best_acc = checkpoint['acc']
start_epoch = checkpoint['epoch']
这里,实验中导入的是resnet18的参数,模型是resnet18_dropout_adf,不同的模型也可以直接导入吗?
后续开始evaluate:
# Evaluation loop: accumulates accuracy, Brier score and negative
# log-likelihood over the test set. Relies on names defined earlier in
# eval.py: testloader, criterion, compute_preds, one_hot_pred_from_label,
# compute_log_likelihood, compute_brier_score, progress_bar, and the running
# accumulators (test_loss, correct, total, brier_score, neg_log_likelihood).
# (Indentation is flattened in this quoted excerpt.)
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(testloader):
inputs, targets = inputs.to(device), targets.to(device)
outputs_mean, data_variance, model_variance = compute_preds(net, inputs, use_adf, use_mcdo)
# Total predictive variance = aleatoric (ADF data_variance) plus
# epistemic (MC-dropout model_variance); tau is added when only the
# epistemic part is available.
# NOTE(review): there is no else-branch assigning outputs_variance when
# both variances are None — presumably it is initialized to None before
# the loop in the full eval.py, otherwise the first such iteration would
# raise NameError at the check below; confirm against the repository.
if data_variance is not None and model_variance is not None:
outputs_variance = data_variance + model_variance
elif data_variance is not None:
outputs_variance = data_variance
elif model_variance is not None:
outputs_variance = model_variance + args.tau
one_hot_targets = one_hot_pred_from_label(outputs_mean, targets)
# Compute negative log-likelihood (if variance estimate available)
if outputs_variance is not None:
batch_log_likelihood = compute_log_likelihood(outputs_mean, one_hot_targets, outputs_variance)
batch_neg_log_likelihood = -batch_log_likelihood
# Sum along batch dimension
neg_log_likelihood += torch.sum(batch_neg_log_likelihood, 0).cpu().numpy().item()
# Compute brier score
batch_brier_score = compute_brier_score(outputs_mean, one_hot_targets)
# Sum along batch dimension
brier_score += torch.sum(batch_brier_score, 0).cpu().numpy().item()
# Compute loss
loss = criterion(outputs_mean, targets)
test_loss += loss.item()
# Compute predictions and number of correct predictions
_, predicted = outputs_mean.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()
if args.show_bar and args.verbose:
progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
% (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
其中,计算预测值:
def compute_preds(net, inputs, use_adf=False, use_mcdo=False):
    """Compute softmax predictions and uncertainty estimates for one batch.

    Args:
        net: the network; for ADF models its forward returns (mean, variance).
        inputs: input batch tensor.
        use_adf: if True, propagate the network's (mean, variance) output
            through the ADF softmax -> aleatoric (data) uncertainty.
        use_mcdo: if True, draw args.num_samples stochastic forward passes
            with dropout kept active -> epistemic (model) uncertainty.

    Returns:
        (outputs_mean, data_variance, model_variance); each variance entry
        is None when the corresponding estimate is not available.
    """
    model_variance = None
    data_variance = None

    def keep_variance(x, min_variance):
        # Lower-bound the variance so downstream divisions stay stable.
        return x + min_variance

    keep_variance_fn = lambda x: keep_variance(x, min_variance=args.min_variance)
    softmax = nn.Softmax(dim=1)
    adf_softmax = adf.Softmax(dim=1, keep_variance_fn=keep_variance_fn)

    net.eval()
    if use_mcdo:
        # Monte Carlo dropout: dropout layers stay in training mode while
        # the rest of the net is in eval mode; sample several passes.
        net = set_training_mode_for_dropout(net, True)
        outputs = [net(inputs) for i in range(args.num_samples)]
        if use_adf:
            outputs = [adf_softmax(*outs) for outs in outputs]
            outputs_mean = [mean for (mean, var) in outputs]
            data_variance = [var for (mean, var) in outputs]
            # Aleatoric part: average the ADF variances over the samples.
            data_variance = torch.stack(data_variance)
            data_variance = torch.mean(data_variance, dim=0)
        else:
            outputs_mean = [softmax(outs) for outs in outputs]
        outputs_mean = torch.stack(outputs_mean)
        # Epistemic part: variance of the sampled softmax outputs.
        model_variance = torch.var(outputs_mean, dim=0)
        # Compute MCDO prediction
        outputs_mean = torch.mean(outputs_mean, dim=0)
    else:
        outputs = net(inputs)
        # BUG FIX: the original tested `if adf:` — `adf` is the imported
        # module and is always truthy, so plain (non-ADF) models would hit
        # adf_softmax(*outputs) and fail. Test the use_adf flag instead.
        if use_adf:
            outputs_mean, data_variance = adf_softmax(*outputs)
        else:
            outputs_mean = outputs
    net = set_training_mode_for_dropout(net, False)
    return outputs_mean, data_variance, model_variance
这里的定义很有意思,它把data_variance计算为adf_softmax算出来的variance,把model_variance计算为多次采样的mean的方差。(根据我的理解,data_variance应该是net输出的outputs,它又算了一个log_norm)其中,adf_softmax实现:
class Softmax(nn.Module):
    """ADF softmax: propagates a diagonal Gaussian through the softmax.

    Each logit is treated as an independent Gaussian N(mean_i, var_i), so
    exp(logit) is log-normally distributed. Mean and variance of that
    log-normal follow https://en.wikipedia.org/wiki/Log-normal_distribution;
    the softmax normalizer is approximated by the sum of the log-normal means.
    """

    def __init__(self, dim=1, keep_variance_fn=None):
        super(Softmax, self).__init__()
        self.dim = dim
        self._keep_variance_fn = keep_variance_fn

    def forward(self, features_mean, features_variance, eps=1e-5):
        # For X ~ N(mu, s2):  E[exp X] = exp(mu + s2/2),
        #                     Var[exp X] = exp(2*mu + s2) * (exp(s2) - 1).
        mu_plus_half_var = features_mean + 0.5 * features_variance
        exp_mean = torch.exp(mu_plus_half_var)
        exp_var = torch.exp(2 * mu_plus_half_var) * (torch.exp(features_variance) - 1)
        # Approximate partition function; eps guards against division by zero.
        normalizer = torch.sum(exp_mean, dim=self.dim).unsqueeze(self.dim) + eps
        outputs_mean = exp_mean / normalizer
        outputs_variance = exp_var / (normalizer ** 2)
        if self._keep_variance_fn is not None:
            outputs_variance = self._keep_variance_fn(outputs_variance)
        return outputs_mean, outputs_variance
resnet_adf_dropout.py
eval中调用的是:
def ResNet18ADFDropout(p=0.2, noise_variance=1e-3, min_variance=1e-3):
    """CIFAR-10 ResNet-18 with both ADF variance propagation and dropout."""
    # Standard ResNet-18 layout: four stages of two BasicBlocks each.
    return ResNetADFDropout(
        BasicBlock,
        [2, 2, 2, 2],
        num_classes=10,
        p=p,
        noise_variance=noise_variance,
        min_variance=min_variance,
        initialize_msra=False,
    )
其中ResNetADFDropout为:
class ResNetADFDropout(nn.Module):
    """CIFAR ResNet in which every layer propagates a (mean, variance) pair
    via assumed density filtering; dropout is applied after the stem."""

    def __init__(self, block, num_blocks, num_classes=10, p=0.2,
                 noise_variance=1e-3, min_variance=1e-3, initialize_msra=False):
        super(ResNetADFDropout, self).__init__()
        # Clamp every propagated variance from below so it stays positive.
        self.keep_variance_fn = lambda x: keep_variance(x, min_variance=min_variance)
        self._noise_variance = noise_variance
        self.in_planes = 64
        self.conv1 = adf.Conv2d(3, 64, kernel_size=3, stride=1, padding=1,
                                bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn1 = adf.BatchNorm2d(64, keep_variance_fn=self.keep_variance_fn)
        self.ReLU = adf.ReLU(keep_variance_fn=self.keep_variance_fn)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1, p=p,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2, p=p,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2, p=p,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2, p=p,
                                       keep_variance_fn=self.keep_variance_fn)
        self.linear = adf.Linear(512 * block.expansion, num_classes,
                                 keep_variance_fn=self.keep_variance_fn)
        self.AvgPool2d = adf.AvgPool2d(keep_variance_fn=self.keep_variance_fn)
        self.dropout = adf.Dropout(p=p, keep_variance_fn=self.keep_variance_fn)
        # NOTE(review): initialize_msra is accepted but not used here.

    def _make_layer(self, block, planes, num_blocks, stride, p=0.2, keep_variance_fn=None):
        # First block of a stage may downsample (given stride); the rest use 1.
        # NOTE(review): the keep_variance_fn parameter is ignored — every block
        # is wired to self.keep_variance_fn instead.
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, s, p=p,
                                keep_variance_fn=self.keep_variance_fn))
            self.in_planes = planes * block.expansion
        return adf.Sequential(*layers)

    def forward(self, x):
        # Treat the input as a Gaussian: mean = image, variance = constant
        # sensor-noise floor. Every subsequent layer consumes and produces
        # a (mean, variance) pair, hence the * unpacking.
        out = x, torch.zeros_like(x) + self._noise_variance
        out = self.dropout(*self.ReLU(*self.bn1(*self.conv1(*out))))
        out = self.layer1(*out)
        out = self.layer2(*out)
        out = self.layer3(*out)
        out = self.layer4(*out)
        out = self.AvgPool2d(*out, 4)
        # Flatten mean and variance separately before the classifier head.
        flat_mean = out[0].view(out[0].size(0), -1)
        flat_var = out[1].view(out[1].size(0), -1)
        return self.linear(flat_mean, flat_var)
其中_make_layer():
def _make_layer(self, block, planes, num_blocks, stride, p=0.2, keep_variance_fn=None):
    """Stack num_blocks ADF residual blocks; only the first may downsample."""
    # NOTE(review): keep_variance_fn is accepted but unused — each block is
    # wired to self.keep_variance_fn instead.
    stride_list = [stride] + [1] * (num_blocks - 1)
    stacked = []
    for s in stride_list:
        stacked.append(block(self.in_planes, planes, s, p=p,
                             keep_variance_fn=self.keep_variance_fn))
        # Track the channel count for the next block in the chain.
        self.in_planes = planes * block.expansion
    return adf.Sequential(*stacked)
(一种猜想:虽然网络结构变化,但可学习的参数没有变化,所以仍然可以load_model)