Adam梯度下降法
结合了momentum梯度下降法和RMSprop梯度下降法的方法,更加的适用于大多数的情况。
过程
参数
一般的,让 $\beta_1 = 0.9,\ \beta_2 = 0.999,\ \epsilon = 10^{-8}$ 即可。当然,学习速率 $\alpha$ 要自己调整。
由于结合了动量梯度下降法,所以累加的速度可能很大。那么相对于普通的mini-batch法,学习速率要小很多。
代码:
import numpy as np
import matplotlib.pyplot as plt
import copy
import math
from scipy.io import loadmat, savemat
# Load the (pre-shuffled) sample data from disk.
def getData():
    """Return (Xtrain, ytrain, Xtest, ytest) read from data_random.mat.

    The .mat file is assumed to already be randomly shuffled.
    """
    # NOTE: the old unused local `Mtrain = 4500` was dead code and is removed.
    data = loadmat('data_random.mat')
    return data['Xtrain'], data['ytrain'], data['Xtest'], data['ytest']
# Expand each 1-based class label into a one-hot row vector.
def dealY(y, siz):
    """Convert an (m, 1) column of 1-based labels into an (m, siz) one-hot
    np.matrix: row r gets a 1 at column y[r, 0] - 1, zeros elsewhere."""
    rows = y.shape[0]
    onehot = np.mat(np.zeros([rows, siz]))
    for r in range(rows):
        onehot[r, int(y[r, 0]) - 1] = 1
    return onehot
# Visualize a set of examples as a tiled grid of grayscale images.
def displayData(X):
    """Render each row of X as a (height x width) tile in one image grid.

    X: (m, n) array with one flattened image per row; n must factor as
    width * height with width = round(sqrt(n)).
    """
    m, n = np.shape(X)
    width = round(math.sqrt(np.size(X, 1)))
    height = int(n / width)
    drows = math.floor(math.sqrt(m))
    dcols = math.ceil(m / drows)
    pad = 1
    # Blank "canvas"; -1 renders as the darkest gray between tiles.
    darray = -1 * np.ones((pad + drows * (height + pad), pad + dcols * (width + pad)))
    curr_ex = 0
    for j in range(drows):
        for i in range(dcols):
            # BUG FIX: check BEFORE indexing, and also break the outer loop.
            # The old code only broke the inner loop after incrementing, so
            # when m < drows * dcols the next outer iteration read X[m]
            # and raised IndexError.
            if curr_ex >= m:
                break
            # Normalize each tile so examples share a comparable scale.
            max_val = np.max(np.abs(X[curr_ex]))
            if max_val == 0:
                max_val = 1  # avoid division by zero on an all-zero example
            darray[pad + j * (height + pad):pad + j * (height + pad) + height,
                   pad + i * (width + pad):pad + i * (width + pad) + width] \
                = X[curr_ex].reshape((height, width)) / max_val
            curr_ex += 1
        if curr_ex >= m:
            break
    plt.imshow(darray.T, cmap='gray')
    plt.show()
# Activation functions
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    """Element-wise derivative of the sigmoid: s(z) * (1 - s(z))."""
    s = sigmoid(z)
    return np.multiply(s, 1 - s)
# Network architecture
L = 2  # number of weight layers (input -> hidden -> output)
S = [400, 25, 10]  # layer sizes: 400 input features, 25 hidden units, 10 classes
def gradientDescent(X, y):
    """Train the 2-layer network with mini-batch Adam and save the weights.

    X: (m, 400) training inputs; y: (m, 10) one-hot labels (np.matrix).
    Side effect: writes w1/w2/b1/b2 to results_minibatch_512siz.mat.
    """
    m, n = X.shape
    mb_size = 512
    # Split the (already shuffled) data into mini-batches.
    batchesX = []
    batchesY = []
    for i in range(0, m, mb_size):
        en = min(i + mb_size, m)
        batchesX.append(X[i:en, ])
        batchesY.append(y[i:en, ])
    X = batchesX
    y = batchesY
    mb_num = len(X)
    # Index 0 is a placeholder so that layer l lives at list index l.
    W = [-1, ]
    B = [-1, ]
    Vdw = [-1, ]  # Adam first moment (momentum) for W
    Vdb = [-1, ]
    Sdw = [-1, ]  # Adam second moment (RMSprop) for W
    Sdb = [-1, ]
    for i in range(1, L + 1):
        W.append(np.mat(np.random.rand(S[i], S[i - 1])) * 0.01)
        B.append(np.mat(np.zeros([S[i], 1], float)))
        Vdw.append(np.mat(np.zeros([S[i], S[i - 1]], float)))
        Sdw.append(np.mat(np.zeros([S[i], S[i - 1]], float)))
        Vdb.append(np.mat(np.zeros([S[i], 1], float)))
        Sdb.append(np.mat(np.zeros([S[i], 1], float)))
    Z = []
    for i in range(0, L + 1):
        Z.append([])
    rate = 1e-2   # learning rate
    lbda = 1e-4   # L2 regularization strength
    b1 = 0.9      # Adam beta1 (momentum decay)
    b2 = 0.999    # Adam beta2 (RMSprop decay)
    epoch = 1001
    t = 0  # global update-step counter for Adam bias correction
    for T_epoch in range(epoch):
        for k in range(mb_num):
            # Forward pass: Z[i] holds the pre-activations of layer i.
            Z[0] = X[k].T
            cur_size = X[k].shape[0]  # renamed: old code shadowed mb_size here
            for i in range(1, L + 1):
                if i > 1:
                    Z[i] = W[i] * sigmoid(Z[i - 1]) + B[i]
                else:
                    Z[i] = W[i] * Z[i - 1] + B[i]
            # BUG FIX: Adam's bias correction must count every mini-batch
            # update (t), not the epoch number (old code used T_epoch+1 for
            # all batches of an epoch, giving a stale correction factor).
            t += 1
            pw = -1
            for i in range(L, 0, -1):
                if i == L:
                    dz = sigmoid(Z[L]) - y[k].T
                else:
                    dz = np.multiply(pw.T * dz, sigmoid_derivative(Z[i]))
                dw = dz * (sigmoid(Z[i - 1]).T) / cur_size
                dw += lbda * W[i] / cur_size  # L2 penalty gradient
                db = np.sum(dz, axis=1) / cur_size
                pw = copy.copy(W[i])  # keep pre-update W for the next layer's dz
                Vdw[i] = b1 * Vdw[i] + (1 - b1) * dw
                Vdb[i] = b1 * Vdb[i] + (1 - b1) * db
                Sdw[i] = b2 * Sdw[i] + (1 - b2) * np.multiply(dw, dw)
                Sdb[i] = b2 * Sdb[i] + (1 - b2) * np.multiply(db, db)
                # Bias-corrected Adam step.
                W[i] -= rate * np.divide(Vdw[i] / (1 - b1 ** t),
                                         np.sqrt(Sdw[i] / (1 - b2 ** t)) + 1e-8)
                B[i] -= rate * np.divide(Vdb[i] / (1 - b1 ** t),
                                         np.sqrt(Sdb[i] / (1 - b2 ** t)) + 1e-8)
            # Regularized cross-entropy cost on the current mini-batch
            # (uses the Z computed before this step's weight update).
            if T_epoch % 100 == 0:
                Cost = np.sum(np.multiply(y[k].T, np.log(sigmoid(Z[L])))) + \
                       np.sum(np.multiply(1 - y[k].T, np.log(1 - sigmoid(Z[L]))))
                Cost /= -cur_size
                Add = 0
                for l in range(1, L + 1):
                    Add += np.sum(np.multiply(W[l], W[l]))
                Cost += Add * lbda / (2 * cur_size)
                print(T_epoch, Cost)
    savemat('results_minibatch_512siz.mat', mdict={'w1': W[1], 'w2': W[2], 'b1': B[1], 'b2': B[2]})
# Predict class labels for each row of X using the saved weights.
def getAnswer(X):
    """Return a list of 1-based predicted labels, one per row of X.

    Loads w1/w2/b1/b2 from results_minibatch_512siz.mat, runs a forward
    pass, and picks the most activated output unit for each example.
    """
    m, n = X.shape
    data = loadmat('results_minibatch_512siz.mat')
    W = [-1, np.mat(data['w1']), np.mat(data['w2'])]
    B = [-1, np.mat(data['b1']), np.mat(data['b2'])]
    Z = X.T
    for i in range(1, L + 1):
        if i > 1:
            Z = sigmoid(Z)
        Z = W[i] * Z + B[i]
    A = sigmoid(Z)  # (10, m): column I holds the class scores of example I
    # np.argmax returns the first maximum, matching the old strict-'>' scan;
    # +1 converts the 0-based row index back to a 1-based label.
    # Also fixes the old code's shadowing of the builtin `id`.
    return [int(np.argmax(A[:, I])) + 1 for I in range(m)]
if __name__ == "__main__":
    Xtrain, ytrain, Xtest, ytest = getData()
    mtrain, n = Xtrain.shape
    mtest, n = Xtest.shape
    print(mtrain, mtest)
    Nprin = True
    if Nprin:
        # Show 16 random test examples.
        # BUG FIX: sample WITHOUT replacement so the 16 images are distinct;
        # the old choice(permutation(mtest), 16) sampled with replacement
        # and could repeat indices.
        index = np.random.choice(mtest, 16, replace=False)
        part = Xtest[index]
        displayData(part)
        print("对应答案值:")  # ground-truth labels of the shown examples
        for i in range(16):
            print(ytest[index[i], 0], end=' ')
        print('')
    # One-hot encode the labels and train the network.
    ytrainV = dealY(ytrain, 10)
    gradientDescent(Xtrain, ytrainV)
    # Predict on the test set.
    Htest = getAnswer(Xtest)
    if Nprin:
        print("对应预测值:")  # predictions for the same 16 examples
        for i in range(16):
            print(Htest[index[i]], end=' ')
        print('')
    # Accuracy over the whole test set.
    ct = 0
    for i in range(mtest):
        if Htest[i] == ytest[i]:
            ct += 1
    print("accuracy: ", 100 * ct / mtest, "%")