Andrew Ng Deep Learning Exercise 2.1: Improving Deep Neural Networks (Initialization, Regularization, Gradient Checking)


Study notes:
1. Watch each week's video lectures once or twice.
2. Take notes.

3. Do each week's programming assignments; this is where the real value is. First work through the provided notebook, and once you understand it, type the code out yourself so you can use it fluently later.


1. Load Dataset

2. Algorithm implementation

2.1 Parameter initialization

2.2 Forward propagation functions

2.3 Cost computation

2.4 Backward propagation functions

2.5 Gradient checking

2.6 Parameter update

3. Prediction

# import packages
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

1. Load Dataset

# load_2D_dataset() is provided by the course's helper file (reg_utils.py in the original
# assignment) -- assumed to sit next to this notebook.
from reg_utils import load_2D_dataset
train_X, train_Y, test_X, test_Y = load_2D_dataset()
# # Inspect what the loaded dataset actually is: type, shape, first example
# print ('train_X:\n',type(train_X),train_X.shape,'\n')
# print (train_X[:,0])
# print ('test_X:\n',type(test_X),test_X.shape,'\n')
# print (test_X[:,0])

2. Algorithm implementation

2.1 Parameter initialization

def initialize_parameters(layer_dims,initialization='he'):
    
    np.random.seed(3)
    L = len(layer_dims)
    pars = {}
    if initialization == 'zeros':
        for l in range(1,L):
            pars['W'+str(l)] = np.zeros((layer_dims[l],layer_dims[l-1]))
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
        
    elif initialization == 'random':
        for l in range(1,L):
            # the *10 scaling (commented) reproduces the course's "large random initialization" experiment
#             pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])*10
            pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
          
    elif initialization == 'he':
        for l in range(1,L):
            # He initialization scales by sqrt(2/n_prev); the sqrt(1/n_prev) variant (commented) is Xavier-style
            pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])* np.sqrt(2./layer_dims[l-1])
#             pars['W'+str(l)] = np.random.randn(layer_dims[l],layer_dims[l-1])* np.sqrt(1./layer_dims[l-1])
            pars['b'+str(l)] = np.zeros((layer_dims[l],1))
        
    return pars
# test initialize_parameters function
pars_test = initialize_parameters([3,2,1],initialization='he')
print (pars_test)
pars_test = initialize_parameters([3,2,1],initialization='random')
print (pars_test)
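
To see what He initialization buys, here is a quick sanity check (my own addition, not part of the original assignment): with He scaling the entries of W1 should have a standard deviation close to sqrt(2/n_prev), while zero initialization gives identical weights everywhere and therefore cannot break symmetry.

# Quick sanity check (my own addition): He-initialized weights should have
# std close to sqrt(2 / n_prev); zero initialization gives all-identical weights.
pars_he = initialize_parameters([1000, 500, 1], initialization='he')
print (pars_he['W1'].std(), np.sqrt(2. / 1000))   # the two numbers should be close
pars_zero = initialize_parameters([1000, 500, 1], initialization='zeros')
print (np.all(pars_zero['W1'] == 0))              # True: every hidden unit starts identical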

2.2 Forward propagation functions

def linear_forward(A,W,b,keep_prob=1,regularization=None):
    
    # Note: no np.random.seed here. Re-seeding inside this function on every call made
    # every layer draw the identical mask D, which is why D2 never matched the
    # courseware's D2. The seed is now set once per forward pass in L_model_forward.
    # D is drawn even when dropout is off so that the cache always has the same structure.
    D = np.random.rand(A.shape[0],A.shape[1])
    # inverted dropout: zero out units with probability (1 - keep_prob), then rescale
    if regularization == 'dropout':
        D = np.where(D <= keep_prob,1,0)
        A = np.multiply(A,D)
        A = A/keep_prob
    #####################################
    
    Z = np.dot(W,A) + b
    cache = (A,W,b,D)
    
    return Z,cache
# Small experiment: where np.random.seed is placed matters.
# Seeding once BEFORE the loop makes the whole sequence reproducible (the first draw is the
# same as with in-loop seeding, the later draws differ); seeding INSIDE the loop would
# regenerate the same numbers on every iteration.
np.random.seed(1)  # seed here: the three draws below are all different
for i in range(3):
#     np.random.seed(1)     # seed here instead: every draw would be identical
    D = np.random.rand(2,3)
    print (D,'\n')

# re-seeding and drawing three times in a row reproduces exactly the sequence printed above
np.random.seed(1)
print ('- '*30)
D = np.random.rand(2,3)
print (D,'\n')
D = np.random.rand(2,3)
print (D,'\n')
D = np.random.rand(2,3)
print (D,'\n')
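
The dropout branch inside linear_forward is easier to see on its own. Below is a minimal inverted-dropout sketch (my own addition, made-up numbers): keep each unit with probability keep_prob, zero out the rest, and divide by keep_prob so the expected value of each activation is unchanged.

# Minimal inverted-dropout sketch (my own addition).
np.random.seed(1)
A_demo = np.array([[0.5, 1.0, 2.0],
                   [1.5, 0.2, 0.8]])
keep_prob_demo = 0.5
D_demo = (np.random.rand(*A_demo.shape) <= keep_prob_demo).astype(int)   # 0/1 mask
A_dropped = A_demo * D_demo / keep_prob_demo
print (D_demo)       # which units survived
print (A_dropped)    # surviving units are scaled up by 1/keep_prob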
def sigmoid_forward(Z):
    '''
    arguments:
    Z --> pre-activation value
    
    returns:
    A --> sigmoid(Z)
    cache --> Z, stored for the backward pass
    '''
    A = 1./(1+np.exp(-Z))
    cache = Z
    
    return A,cache
def relu_forward(Z):
    '''
    arguments:
    Z --> pre-activation value
    
    returns:
    A --> ReLU(Z)
    cache --> Z, stored for the backward pass
    '''
#     A = np.maximum(0.01*Z,Z)   # leaky ReLU variant
    A = np.maximum(0,Z)
    cache = Z
    
    return A,cache
def activation_forward(Z,activation):
    
    if activation == 'sigmoid':
        A,cache = sigmoid_forward(Z)
    elif activation == 'relu':
        A,cache = relu_forward(Z)
    
    return A,cache
def linear_activation_forward(A_prev,W,b,activation,keep_prob=1,regularization=None):
    
    Z,linear_cache = linear_forward(A_prev,W,b,keep_prob=keep_prob,regularization=regularization)
    A,activation_cache =  activation_forward(Z,activation)
    cache = (linear_cache,activation_cache)
    
    return A,cache
def L_model_forward(X,pars,keep_prob=1,regularization=None):
    caches = []
    A = X
    L = len(pars)//2 + 1
    np.random.seed(1)
    
    A_prev = A
    # first hidden layer: dropout is never applied to the input X
    A,cache = linear_activation_forward(A_prev,pars['W1'],pars['b1'],activation='relu',keep_prob=1,regularization=None)
    caches.append(cache)
    
    
    for l in range(2,L-1):
        A_prev = A
        A,cache = linear_activation_forward(A_prev,pars['W'+str(l)],pars['b'+str(l)],activation='relu',keep_prob=keep_prob,regularization=regularization)
        caches.append(cache)
        
    AL,cache = linear_activation_forward(A,pars['W'+str(L-1)],pars['b'+str(L-1)],activation='sigmoid',keep_prob=keep_prob,regularization=regularization)
    caches.append(cache)
    assert(AL.shape == (1,X.shape[1]))
#     print ('- '*30)
    
    return AL,caches
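
As a quick smoke test of the forward pass (my own addition), a tiny 3-4-1 network on random data should return activations of shape (1, m) and cache one tuple per layer:

# Quick smoke test (my own addition): shapes only, the values are random.
np.random.seed(0)
X_demo = np.random.randn(3, 5)                              # 3 features, 5 examples
pars_demo = initialize_parameters([3, 4, 1], initialization='he')
AL_demo, caches_demo = L_model_forward(X_demo, pars_demo)
print (AL_demo.shape, len(caches_demo))                     # (1, 5) and 2 cached layers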

2.3 Cost computation

def compute_cost(AL,Y,pars,lambd=0,regularization=None):
    assert(AL.shape[1] == Y.shape[1])
    
    m = Y.shape[1]
#     cost = -np.mean(Y*np.log(AL)+(1-Y)*np.log(1-AL),axis=1,keepdims=True)   # * is element-wise for ndarrays but a matrix product for np.matrix
#     cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T)) # np.dot does the element-wise product and the sum in one step
#     cost = np.squeeze(cost)

    cost = (1./m) * (-np.multiply(Y,np.log(AL)) - np.multiply(1-Y, np.log(1-AL)))  # np.multiply is element-wise for arrays and matrices alike
    cost = np.nansum(cost)        # np.nansum ignores NaN entries (e.g. from 0*log(0)), so the sum still goes through

    # this code for L2 regularization
    if regularization == 'L2':
        l2 = 0
        L = int(len(pars)/2)
        for l in range(1,L+1):
            l2 += np.sum(np.square(pars['W'+str(l)]))
        l2 = l2*lambd/m/2
        cost = cost + l2
    ##############################
    
    # three kinds of multiplication to keep apart: *, np.dot, np.multiply
    return cost
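
Since the comment above mentions three kinds of multiplication that are easy to confuse, here is a tiny sketch (my own addition) of how *, np.multiply and np.dot behave on plain 2-D ndarrays:

# Tiny sketch (my own addition): *, np.multiply and np.dot on 2-D ndarrays.
a = np.array([[1., 2.],
              [3., 4.]])
b = np.array([[10., 20.],
              [30., 40.]])
print (a * b)             # element-wise product (for np.matrix, * would be a matrix product instead)
print (np.multiply(a, b)) # always element-wise, same as a * b for ndarrays
print (np.dot(a, b))      # matrix product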

2.4 Backward propagation functions

def sigmoid_backward(dA,activation_cache):
    
    Z = activation_cache
    A = 1./(1 + np.exp(-Z))
    dZ = dA*A*(1-A)
    
    return dZ
def relu_backward(dA,activation_cache):
    
    Z = activation_cache
    dZ = np.array(dA,copy=True)
    assert (dZ.shape == Z.shape)
    dZ[Z <= 0] = 0
    
    return dZ
def activation_backward(dA,activation_cache,activation):
    
    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA,activation_cache)
    elif activation == 'relu':
        dZ = relu_backward(dA,activation_cache)
        
    return dZ
    
def linear_backward(dZ,linear_cache,lambd=0,regularization=None,keep_prob=1):
    
    A_prev, W, b ,D = linear_cache
    m = A_prev.shape[1]
    dA_prev = np.dot(W.T,dZ)
    
    # this code for dropout: re-apply the mask used in the forward pass, then rescale
    if regularization == 'dropout':
        assert (dA_prev.shape == D.shape)
        dA_prev = np.multiply(dA_prev,D)
        dA_prev = dA_prev/keep_prob
    ######################################
    
    dW = 1./m*np.dot(dZ,A_prev.T)       # forgetting the 1/m factor here gave wrong gradients earlier
    
    # this code for L2 regularization
    if regularization == 'L2':
        dW = dW + W*lambd/m
    ######################
    
    db = np.mean(dZ,axis=1,keepdims=True)
#     db = 1./m * np.sum(dZ)   # differs because np.sum without axis=1 sums over ALL elements;
#                              # 1./m * np.sum(dZ, axis=1, keepdims=True) matches the np.mean line above.
    # The earlier mismatch with the course results came from this db formula.
    return dA_prev,dW,db
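
The db question in the comments above comes down to a missing axis argument; a quick check (my own addition):

# Quick check (my own addition): with axis=1 the two db formulas agree; without it,
# np.sum collapses over ALL elements and returns a single scalar.
dZ_demo = np.array([[1., 2., 3.],
                    [4., 5., 6.]])
m_demo = dZ_demo.shape[1]
print (np.mean(dZ_demo, axis=1, keepdims=True))               # [[2.], [5.]]
print (1./m_demo * np.sum(dZ_demo, axis=1, keepdims=True))    # [[2.], [5.]], identical
print (1./m_demo * np.sum(dZ_demo))                           # 7.0, the buggy scalar version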
def activation_linear_backward(dA,cache,activation,lambd=0,regularization=None,keep_prob=1):
    
    linear_cache,activation_cache = cache
    
    dZ = activation_backward(dA,activation_cache,activation)
    dA_prev,dW,db = linear_backward(dZ,linear_cache,lambd=lambd,regularization=regularization,keep_prob=keep_prob)

    return dA_prev,dW,db
def L_model_backward(AL,Y,caches,lambd=0,regularization=None,keep_prob=1):
    
    Y = Y.reshape(AL.shape)
    dAL = -(np.divide(Y,AL) - np.divide(1-Y,1-AL))
    grads = {}
    L = len(caches) + 1
    current_cache = caches[L-2]
    
    grads['dA'+str(L-1)],grads['dW'+str(L-1)],grads['db'+str(L-1)] = activation_linear_backward(dAL,current_cache,activation='sigmoid',lambd=lambd,regularization=regularization,keep_prob=keep_prob)
    for l in reversed(range(L-2)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = activation_linear_backward(grads['dA'+str(l+2)],current_cache,activation='relu',lambd=lambd,regularization=regularization,keep_prob=keep_prob)
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    
    return grads

2.5 Gradient checking
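
Gradient checking compares the analytic gradients from backpropagation against a numerical estimate. For each parameter theta_i, a two-sided difference is used:

    gradapprox[i] = (J(theta_1, ..., theta_i + epsilon, ...) - J(theta_1, ..., theta_i - epsilon, ...)) / (2 * epsilon)

and the two gradient vectors are then compared through the relative difference

    difference = ||grad - gradapprox||_2 / (||grad||_2 + ||gradapprox||_2)

which should be on the order of 1e-7 or smaller when backpropagation is correct; this is exactly what gradient_check_n below computes. The two helper functions first flatten the parameter dictionary into one long vector (and back), so that each theta_i can be perturbed individually.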

def dictionary_to_vector(parameters,flag='pars'):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.
    """
    shapes = []
    count = 0
    
    if flag == 'pars':
        L = int(len(parameters)/2)
        keylist = []
        for l in range(1,L+1):
            keylist.append('W'+str(l))
            keylist.append('b'+str(l))
    elif flag == 'grads':
        L = int(len(parameters)/3)
        keylist = []
        for l in range(1,L+1):
            keylist.append('dW'+str(l))
            keylist.append('db'+str(l))
            
    for key in keylist:
        new_vector = np.reshape(parameters[key], (-1,1))
        pars_shape = parameters[key].shape
        shapes.append(pars_shape)

        if count == 0:
            theta = new_vector
        else:
            theta = np.concatenate((theta, new_vector), axis=0)
        count = count + 1        

    return theta, shapes

def vector_to_dictionary(theta,shapes,pre_pars):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.
    """
    parameters = {}
    L = int(len(pre_pars)/2)
    i = 0
    for l in range(1,L+1):
        a = shapes[2*(l-1)]
        parameters['W'+str(l)] = theta[i:i+a[0]*a[1]].reshape(a)
        i +=a[0]*a[1]
        
        a = shapes[2*l-1]
        parameters['b'+str(l)] = theta[i:i+a[0]*a[1]].reshape(a)
        i +=a[0]*a[1]

    return parameters
# test dictionary_to_vector and vector_to_dictionary function
_, _, parameters_test = gradient_check_n_test_case()
# print (parameters)
theta_test,shapes_test = dictionary_to_vector(parameters_test)
# print (theta_test)
pars_test = vector_to_dictionary(theta_test,shapes_test,parameters_test)
print (pars_test)
# GRADED FUNCTION: gradient_check_n

def gradient_check_n(parameters, gradients, X, Y, epsilon = 1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n
    
    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
    grad -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters. 
    x -- input datapoint, of shape (input size, 1)
    y -- true "label"
    epsilon -- tiny shift to the input to compute approximated gradient with formula(1)
    
    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """
    
    # Set-up variables
    parameters_values, parameters_shapes = dictionary_to_vector(parameters)
    grad,_ = dictionary_to_vector(gradients,flag='grads')
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))
    
    # Compute gradapprox
    for i in range(num_parameters):
        
        # Compute J_plus[i]: perturb theta_i by +epsilon, rebuild the parameter
        # dictionary, run a forward pass, and evaluate the cost.
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus = np.copy(parameters_values)                                    # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                               # Step 2

        pars = vector_to_dictionary(thetaplus,parameters_shapes,parameters)
        AL_plus,caches_plus = L_model_forward(X,pars)
        J_plus[i] = compute_cost(AL_plus,Y,pars,lambd=0,regularization=None)
    
        ### END CODE HERE ###
        
        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                   # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                           # Step 2        
        
        pars = vector_to_dictionary(thetaminus,parameters_shapes,parameters)
        AL_mins,caches_minus = L_model_forward(X,pars)
        J_minus[i] = compute_cost(AL_mins,Y,pars,lambd=0,regularization=None)
        ### END CODE HERE ###
        
        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i])/2/epsilon
        ### END CODE HERE ###

    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(gradapprox - grad)                                          # Step 1'
    denominator = np.linalg.norm(gradapprox) + np.linalg.norm(grad)                                         # Step 2'
    difference = numerator/denominator                                # Step 3'
    ### END CODE HERE ###

    if difference > 1e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    
    return difference
X, Y, parameters = gradient_check_n_test_case()
Y = Y.reshape(1,-1)

AL,caches = L_model_forward(X,parameters)
grads = L_model_backward(AL,Y,caches,lambd=0,regularization=None)
difference = gradient_check_n(parameters, grads, X, Y)

2.6 Parameter update

def update_parameters(pars,grads,learning_rate):
    
    L = len(pars)//2 + 1
    for l in range(1,L):
        pars['W'+str(l)] = pars['W'+str(l)] - learning_rate*grads['dW'+str(l)]
        pars['b'+str(l)] = pars['b'+str(l)] - learning_rate*grads['db'+str(l)]
    
    return pars
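
As a sanity check of the update rule W := W - learning_rate * dW (my own addition), a single step on made-up numbers:

# One gradient-descent step on made-up values (my own addition).
pars_demo = {'W1': np.array([[1.0, 2.0]]), 'b1': np.array([[0.5]])}
grads_demo = {'dW1': np.array([[0.1, -0.2]]), 'db1': np.array([[0.05]])}
print (update_parameters(pars_demo, grads_demo, learning_rate=0.1))
# expected: W1 -> [[0.99, 2.02]], b1 -> [[0.495]]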

L_layer_model

def L_layer_model(X,Y,layer_dims,learning_rate = 0.01,num_iterations = 3000,print_cost=False,initialization='he',lambd=0,regularization=None,keep_prob = 1):
    
    '''
    1. Initialize the parameters
    2. Loop over num_iterations:
        3. Forward propagation
        4. Compute the cost
        5. Backward propagation
        6. Update the parameters
    7. Return costs and pars
    '''
#     np.random.seed(1)
    
    # initialize parameters
    pars = initialize_parameters(layer_dims,initialization)

    L = len(layer_dims)
    costs = []
    for i in range(0,num_iterations):
        
        # forward propagation (keep_prob/regularization must be passed through,
        # otherwise dropout is silently skipped)
        AL,caches = L_model_forward(X,pars,keep_prob=keep_prob,regularization=regularization)

        # compute cost
        cost = compute_cost(AL,Y,pars,lambd=lambd,regularization=regularization)

        if i%1000 ==0 :
            costs.append(cost)
        if i%10000 ==0 and print_cost:
            print("Cost after iteration %i: %f" %(i, cost))

        # backward propagation
        grads = L_model_backward(AL,Y,caches,lambd=lambd,regularization=regularization,keep_prob=keep_prob)
    
        # update parameters
        pars = update_parameters(pars,grads,learning_rate)
        
    plt.figure(figsize = (30,6.5))
    plt.subplot(1,2,1)
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per thousands)')
    plt.title("Learning rate =" + str(learning_rate))

    return costs,pars
layers_dims = [2,20,3,1] #  3-layer model
# no regularization 
costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he')

# # L2 regularization
# costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he',lambd=0.7,regularization='L2')

# # dropout
# costs_test,pars_test = L_layer_model(train_X, train_Y, layers_dims,learning_rate = 0.3, num_iterations = 30000, print_cost = True,initialization='he',lambd=0,regularization='dropout',keep_prob = 0.86)

3. Prediction

With learning_rate and num_iterations specified, training yields the learned parameters, which are then used to make predictions.

def predict(X, y, parameters):
    """
    This function is used to predict the results of a  L-layer neural network.
    
    Arguments:
    X -- data set of examples you would like to label
    y -- true labels, used only to report accuracy
    parameters -- parameters of the trained model
    
    Returns:
    p -- predictions for the given dataset X
    """
    
    m = X.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)

    
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0
    
    #print results
    #print ("predictions: " + str(p))
    #print ("true labels: " + str(y))
    print("Accuracy: "  + str(np.sum((p == y)/m)))
        
    return p
pred_train = predict(train_X, train_Y, pars_test)
pred_test = predict(test_X, test_Y, pars_test)
def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=np.squeeze(y), cmap=plt.cm.Spectral)
    plt.show()
def predict_dec(parameters, X):
    """
    Used for plotting decision boundary.
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    X -- input data of shape (n_x, m)
    
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = L_model_forward(X, parameters)
    predictions = (a3>0.5)
    return predictions
# plt.title("Model with He initialization")
axes = plt.gca()
axes.set_xlim([-0.75,0.40])
axes.set_ylim([-0.75,0.65])
plot_decision_boundary(lambda x: predict_dec(pars_test, x.T), train_X, np.squeeze(train_Y))

If you need the full set of assignment notebooks and materials, you can add me on WeChat: yuhaidong112
