Andrew Ng Deep Learning 2.2 Exercise_Improving Deep Neural Networks_Optimization

Copyright notice: This is an original post by the author; reproduction without the author's permission is not allowed. https://blog.csdn.net/weixin_42432468

Study notes:
1. Watch each week's video lectures once or twice.
2. Take notes.

3. Do each week's programming assignment; this is where the real value lies. Work through the provided notebook first, and once you understand it, type the code out yourself so you can use it fluently later.


1. Load Dataset

2. Algorithm implementation

2.1 Parameter initialization

2.2 Forward propagation functions

2.3 Computing the cost

2.4 Backward propagation functions

2.5 Parameter update

3. Prediction

# import packages
import numpy as np
import matplotlib.pyplot as plt
# from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
# from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
from reg_utils import  load_2D_dataset
import sklearn
import sklearn.datasets
import scipy.io
from testCases_improve_regulariation import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

1. Load Dataset

train_X, train_Y, test_X, test_Y = load_2D_dataset()

[Figure: scatter plot of the 2D training dataset returned by load_2D_dataset()]

# Inspect what the loaded dataset actually is: type, shape, and the first example
print ('train_X:\n',type(train_X),train_X.shape,'\n')
print (train_X[:,0])
print ('test_X:\n',type(test_X),test_X.shape,'\n')
print (test_X[:,0])
train_X:
 <class 'numpy.ndarray'> (2, 211) 

[-0.158986  0.423977]
test_X:
 <class 'numpy.ndarray'> (2, 200) 

[-0.35306235 -0.67390181]
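
A small addition of my own (not in the original notebook): the label arrays returned by load_2D_dataset can be inspected the same way, using the train_Y and test_Y variables from the call above.

print ('train_Y:\n',type(train_Y),train_Y.shape,'\n')
print ('test_Y:\n',type(test_Y),test_Y.shape,'\n')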

2. Algorithm implementation

2.1 Parameter initialization

def initialize_parameters(layer_dims,initialization='he'):
    
    np.random.seed(3)
    L = len(layer_dims)
    pars = {}
    if initialization == 'zeros':
        for l in range(1,L):
            pars['W'+str(l)] = np.zeros((layer_dims[l],layer_dims[l-1]))
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
        
    elif initialization == 'random':
        for l in range(1,L):
#             pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*10
            pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
          
    elif initialization == 'he':
        for l in range(1,L):
            # Note: standard He initialization scales by np.sqrt(2./layer_dims[l-1]) (the commented line);
            # the active line below scales by np.sqrt(1./layer_dims[l-1]), which is Xavier-style scaling.
#             pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])* np.sqrt(2./layer_dims[l-1])
            pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])* np.sqrt(1./layer_dims[l-1])
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
        
    return pars
# test initialize_parameters function
pars_test = initialize_parameters([3,2,1],initialization='he')
print (pars_test)
pars_test = initialize_parameters([3,2,1],initialization='random')
print (pars_test)
{'W1': array([[ 1.03266513,  0.25201908,  0.05571284],
       [-1.07588801, -0.16015015, -0.20482019]]), 'b1': array([[0.],
       [0.]]), 'W2': array([[-0.05850706, -0.44335643]]), 'b2': array([[0.]])}
{'W1': array([[ 1.78862847,  0.43650985,  0.09649747],
       [-1.8634927 , -0.2773882 , -0.35475898]]), 'b1': array([[0.],
       [0.]]), 'W2': array([[-0.08274148, -0.62700068]]), 'b2': array([[0.]])}
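
A quick sanity check I added (not course code): both calls above use seed 3, so the 'he' weights are exactly the 'random' weights scaled by sqrt(1/fan_in) with fan_in = 3, i.e. Xavier-style scaling; standard He initialization would scale by sqrt(2/fan_in) instead.

print (1.78862847 * np.sqrt(1./3))   # ~1.03266513, matches W1[0,0] of the 'he' output above
print (1.78862847 * np.sqrt(2./3))   # ~1.4604, what standard He initialization would give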

2.2 Forward propagation functions

def linear_forward(A,W,b,keep_prob=1,regularization=None):
    
    np.random.seed(1)   # note: re-seeding here restarts the same random sequence on every call
    D = np.random.rand(A.shape[0],A.shape[1])
    # this code for dropout
    if regularization == 'dropout':
#         print ('D:\n',D)   # Not sure why D2 in the second call differs from the course notebook's D2; that difference is why the final result below does not match the course result
        D = np.where(D <= keep_prob,1,0)
        A = np.multiply(A,D)
        A = A/keep_prob
    #####################################
    
    Z = np.dot(W,A) + b
    cache = (A,W,b,D)
    
    return Z,cache
# With the seed inside the loop, only the first draw would be the same; later draws would differ
np.random.seed(1)  # seed placed here (before the loop): the results match the sequential draws below
for i in range(3):
#     np.random.seed(1)     # seed placed here instead: the random data would not match
    D = np.random.rand(2,3)
    print (D,'\n')

np.random.seed(1)
print ('- '*30)
D = np.random.rand(2,3)
print (D,'\n')
D = np.random.rand(2,3)
print (D,'\n')
D = np.random.rand(2,3)
print (D,'\n')
[[4.17022005e-01 7.20324493e-01 1.14374817e-04]
 [3.02332573e-01 1.46755891e-01 9.23385948e-02]] 

[[0.18626021 0.34556073 0.39676747]
 [0.53881673 0.41919451 0.6852195 ]] 

[[0.20445225 0.87811744 0.02738759]
 [0.67046751 0.4173048  0.55868983]] 

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
[[4.17022005e-01 7.20324493e-01 1.14374817e-04]
 [3.02332573e-01 1.46755891e-01 9.23385948e-02]] 

[[0.18626021 0.34556073 0.39676747]
 [0.53881673 0.41919451 0.6852195 ]] 

[[0.20445225 0.87811744 0.02738759]
 [0.67046751 0.4173048  0.55868983]] 
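
The takeaway from the experiment above: seeding once and then calling np.random.rand repeatedly reproduces the same sequence as drawing everything after a single seed, while re-seeding before every draw restarts the sequence each time. My guess (not verified against the course notebook line by line) is that the course seeds once per forward pass and then draws the dropout masks for the hidden layers in order, roughly like the sketch below (the shapes are only illustrative):

np.random.seed(1)
D1 = np.random.rand(2, 5)   # mask for A1
D2 = np.random.rand(3, 5)   # mask for A2 -- a different, consecutive draw
# Re-seeding inside linear_forward (as in the function above) restarts the sequence at
# every layer, so every layer's mask begins from the same random numbers.
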
def sigmoid_forward(Z):
    '''
    arguments:
    Z -- pre-activation input
    
    returns:
    A -- sigmoid(Z)
    cache -- Z, stored for the backward pass
    '''
    A = 1./(1+np.exp(-Z))
    cache = Z
    
    return A,cache
def relu_forward(Z):
    '''
    arguments:
    Z -- pre-activation input
    
    returns:
    A -- ReLU(Z)
    cache -- Z, stored for the backward pass
    '''
#     A = np.maximum(0.01*Z,Z)   # leaky ReLU variant
    A = np.maximum(0,Z)
    cache = Z
    
    return A,cache
def activation_forward(Z,activation):
    
    if activation == 'sigmoid':
        A,cache = sigmoid_forward(Z)
    elif activation == 'relu':
        A,cache = relu_forward(Z)
    
    return A,cache
def linear_activation_forward(A_prev,W,b,activation,keep_prob=1,regularization=None):
    
    Z,linear_cache = linear_forward(A_prev,W,b,keep_prob=keep_prob,regularization=regularization)
    A,activation_cache =  activation_forward(Z,activation)
    cache = (linear_cache,activation_cache)
    
    return A,cache
def L_model_forward(X,pars,keep_prob=1,regularization=None):
    caches = []
    A = X
    L = len(pars)//2 + 1
    np.random.seed(1)
    
    A_prev = A
    A,cache = linear_activation_forward(A_prev,pars['W1'],pars['b1'],activation='relu',keep_prob=1,regularization=None)
    caches.append(cache)
    
#     A_prev = A
#     A,cache = linear_activation_forward(A_prev,pars['W2'],pars['b2'],activation='relu',keep_prob=keep_prob,regularization=regularization)
#     caches.append(cache)
    
    for l in range(2,L-1):
        A_prev = A
        A,cache = linear_activation_forward(A_prev,pars['W'+str(l)],pars['b'+str(l)],activation='relu',keep_prob=keep_prob,regularization=regularization)
        caches.append(cache)
        
    AL,cache = linear_activation_forward(A,pars['W'+str(L-1)],pars['b'+str(L-1)],activation='sigmoid',keep_prob=keep_prob,regularization=regularization)
    caches.append(cache)
    assert(AL.shape == (1,X.shape[1]))

    return AL,caches
X_assess, parameters = forward_propagation_with_dropout_test_case()

A3, cache = L_model_forward(X_assess, parameters, keep_prob = 0.7,regularization='dropout')
print ("A3 = " + str(A3))
A3 = [[0.36974721 0.49683389 0.04565099 0.01446893 0.36974721]]

2.3 Computing the cost

def compute_cost(AL,Y,pars,lambd=0,regularization=None):
    assert(AL.shape[1] == Y.shape[1])
    
#     cost = -np.mean(Y*np.log(AL)+(1-Y)*np.log(1-AL),axis=1,keepdims=True) # * is element-wise for ndarrays; for np.matrix it would be matrix multiplication

    m = Y.shape[1]
#     cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T)) # np.dot multiplies element-wise and sums in one step, then divide by m
#     cost = np.squeeze(cost)
#     print (AL)

    cost = (1./m) * (-np.multiply(Y,np.log(AL)) - np.multiply(1-Y, np.log(1-AL))) # element-wise product (for ndarrays or matrices), scaled by 1/m, then summed below
    cost = np.nansum(cost)        # np.nansum still works even if the array contains NaN values

    # this code for L2 regularization 
    if regularization == 'L2':
        l2 = 0
        L = int(len(pars)/2)
        for l in range(1,L+1):
            a = np.sum(np.square(pars['W'+str(l)]))
            l2 +=a
        l2 = l2*lambd/m/2
        cost = cost + l2
     ##############################
    
#  three kinds of multiplication: *, np.dot, np.multiply
    return cost
# test compute_cost with regularization function 
A3, Y_assess, parameters = compute_cost_with_regularization_test_case()
print("cost = " + str(compute_cost(A3, Y_assess, parameters, lambd = 0.1,regularization='L2')))
cost = 1.786485945159076
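
For reference, the quantity computed above is the cross-entropy cost plus the L2 penalty; the l2*lambd/m/2 factor in the code corresponds to the λ/(2m) term here:

$$J_{regularized} = -\frac{1}{m}\sum_{i=1}^{m}\Big(y^{(i)}\log a^{[L](i)} + (1-y^{(i)})\log\big(1-a^{[L](i)}\big)\Big) + \frac{\lambda}{2m}\sum_{l}\big\lVert W^{[l]}\big\rVert_F^2$$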

2.4 Backward propagation functions

def sigmoid_backward(dA,activation_cache):
    
    Z = activation_cache
    A = 1./(1 + np.exp(-Z))
    dZ = dA*A*(1-A)
    
    return dZ
def relu_backward(dA,activation_cache):
    
    Z = activation_cache
    dZ = np.array(dA,copy=True)
    assert (dZ.shape == Z.shape)
    dZ[Z <= 0] = 0
    
    return dZ
def activation_backward(dA,activation_cache,activation):
    
    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA,activation_cache)
    elif activation == 'relu':
        dZ = relu_backward(dA,activation_cache)
        
    return dZ
    
def linear_backward(dZ,linear_cache,lambd=0,regularization=None,keep_prob=1):
    
    A_prev, W, b ,D = linear_cache
    m = A_prev.shape[1]
    dA_prev = np.dot(W.T,dZ)
    
    # this code for dropout
    if regularization == 'dropout':
        assert (dA_prev.shape == D.shape)
        dA_prev = np.multiply(dA_prev,D)
        dA_prev = dA_prev/keep_prob
    ######################################
    
    dW = 1./m*np.dot(dZ,A_prev.T)       # forgetting the 1/m factor here led to wrong results earlier
    
    # this code for regularization
    if regularization == 'L2':
        dW = dW + W*lambd/m
    ######################
    
    db = np.mean(dZ,axis=1,keepdims=True)   # keep one db value per unit; use this form
#     db = 1./m * np.sum(dZ)  # why do these two ways give different results? (see note below)
    # The mismatch with the course results came from computing db differently:
    # np.sum(dZ) without axis=1 sums over ALL entries and returns a scalar, while
    # np.mean(dZ, axis=1, keepdims=True) keeps a separate mean per unit (shape (n_l, 1)).
    return dA_prev,dW,db
def activation_linear_backward(dA,cache,activation,lambd=0,regularization=None,keep_prob=1):
    
    linear_cache,activation_cache = cache
    
    dZ = activation_backward(dA,activation_cache,activation)
    dA_prev,dW,db = linear_backward(dZ,linear_cache,lambd=lambd,regularization=regularization,keep_prob=keep_prob)

    return dA_prev,dW,db
def L_model_backward(AL,Y,caches,lambd=0,regularization=None,keep_prob=1):
    
    Y = Y.reshape(AL.shape)
    dAL = -(np.divide(Y,AL) - np.divide(1-Y,1-AL))
    grads = {}
    L = len(caches) + 1
    current_cache = caches[L-2]
    
    grads['dA'+str(L-1)],grads['dW'+str(L-1)],grads['db'+str(L-1)] = activation_linear_backward(dAL,current_cache,activation='sigmoid',lambd=lambd,regularization=regularization,keep_prob=keep_prob)
    for l in reversed(range(L-2)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = activation_linear_backward(grads['dA'+str(l+2)],current_cache,activation='relu',lambd=lambd,regularization=regularization,keep_prob=keep_prob)
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    
    return grads
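
A small standalone demo I added (illustrative values, not course code) of why the two db formulas discussed in linear_backward disagree: without axis=1, np.sum collapses over all entries and returns a single scalar.

dZ_demo = np.array([[1., 2., 3.],
                    [4., 5., 6.]])
m_demo = dZ_demo.shape[1]
print (np.mean(dZ_demo, axis=1, keepdims=True))             # [[2.] [5.]] -- one value per unit
print (1./m_demo * np.sum(dZ_demo))                         # 7.0 -- sums over ALL entries, a scalar
print (1./m_demo * np.sum(dZ_demo, axis=1, keepdims=True))  # [[2.] [5.]] -- matches once axis=1 is given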

2.5 Parameter update

def update_parameters(pars,grads,learning_rate):
    
    L = len(pars)//2 + 1
    for l in range(1,L):
        pars['W'+str(l)] = pars['W'+str(l)] - learning_rate*grads['dW'+str(l)]
        pars['b'+str(l)] = pars['b'+str(l)] - learning_rate*grads['db'+str(l)]
    
    return pars
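
The update rule implemented above is plain gradient descent; for each layer $l$, with learning rate $\alpha$:

$$W^{[l]} := W^{[l]} - \alpha\, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha\, db^{[l]}$$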

L_layer_model

def L_layer_model(X,Y,layer_dims,learning_rate = 0.01,num_iterations = 3000,print_cost=False,initialization='he',lambd=0,regularization=None,keep_prob = 1):
    
    '''
    1. Initialize parameters
    2. Loop over num_iterations:
        3. Forward propagation
        4. Compute the cost
        5. Backward propagation
        6. Update parameters
    7. Return costs and pars
    '''
#     np.random.seed(1)
    
    # initialize parameters
    pars = initialize_parameters(layer_dims,initialization)

    L = len(layer_dims)
    costs = []
    for i in range(0,num_iterations):
        
        # forward propagation (pass keep_prob and regularization through, otherwise dropout is never applied)
        AL,caches = L_model_forward(X,pars,keep_prob=keep_prob,regularization=regularization)

        # compute the cost
        cost = compute_cost(AL,Y,pars,lambd=lambd,regularization=regularization) 

        if i%1000 ==0 :
            costs.append(cost)
        if i%10000 ==0 and print_cost:
            print("Cost after iteration %i: %f" %(i, cost))

        # backward propagation
        grads = L_model_backward(AL,Y,caches,lambd=lambd,regularization=regularization,keep_prob=keep_prob)
    
        # update parameters
        pars = update_parameters(pars,grads,learning_rate)
        
    plt.figure(figsize = (30,6.5))
    plt.subplot(1,2,1)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (in thousands)')
    plt.title("Learning rate =" + str(learning_rate))

    return costs,pars
layers_dims = [2,20,3,1] #  3-layer model
# no regularization 
costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he')

# L2 regularization
costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he',lambd=0.7,regularization='L2')

# # dropout
# costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he',lambd=0,regularization='dropout',keep_prob = 0.86)

Cost after iteration 0: 0.655741
Cost after iteration 10000: 0.163300
Cost after iteration 20000: 0.138516
Cost after iteration 0: 0.697448
Cost after iteration 10000: 0.268492
Cost after iteration 20000: 0.268092

[Figure: cost curve for the run without regularization (learning rate 0.3)]

[Figure: cost curve for the run with L2 regularization (learning rate 0.3)]

3. Prediction

With learning_rate and num_iterations fixed, training yields the best parameters found; those parameters are then used to make predictions.

def predict(X, y, parameters):
    """
    This function is used to predict the results of a  L-layer neural network.
    
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    
    Returns:
    p -- predictions for the given dataset X
    """
    
    m = X.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)

    
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0
    
    #print results
    #print ("predictions: " + str(p))
    #print ("true labels: " + str(y))
    print("Accuracy: "  + str(np.sum((p == y)/m)))
        
    return p
# Note: pars_test here comes from the last training call above (the L2-regularized model)
pred_train = predict(train_X, train_Y, pars_test)
pred_test = predict(test_X, test_Y, pars_test)
Accuracy: 0.9383886255924171
Accuracy: 0.9299999999999998
def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()
def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    X -- input data of size (m, K)
    
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = L_model_forward(X, parameters)
    predictions = (a3>0.5)
    return predictions
# plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(pars_test, x.T), train_X, np.squeeze(train_Y))

[Figure: decision boundary of the trained model plotted over the training data]

The dropout results differ from those in the exercise notebook. One reason is that the randomly generated mask D (from np.random.rand) is not the same; whether there are other causes remains to be investigated.
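
A minimal sketch of one possible fix, assuming the course draws the masks sequentially within a single forward pass: rely on the one np.random.seed(1) call in L_model_forward and drop the re-seed inside linear_forward, so each layer's mask is a fresh draw from the same sequence. This is my own sketch, not verified against the course notebook.

def linear_forward_v2(A, W, b, keep_prob=1, regularization=None):
    D = None
    if regularization == 'dropout':
        D = np.random.rand(A.shape[0], A.shape[1])   # no np.random.seed here
        D = np.where(D <= keep_prob, 1, 0)
        A = np.multiply(A, D)
        A = A/keep_prob
    Z = np.dot(W, A) + b
    cache = (A, W, b, D)
    return Z, cache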

If you need the full set of assignment notebooks and materials, you can add me on WeChat: yuhaidong112
