西瓜书课后习题 3.4:十折交叉验证法与留一法,对率回归(逻辑回归)

import csv
import numpy as np


def readData(filename):
    """Load the iris CSV file and split its rows by class label.

    Every data row is expected to hold four numeric feature columns followed
    by the class name in the fifth column.  A constant 1.0 is appended to each
    feature vector ([x; 1]) so the bias can be learned as an ordinary weight.
    Blank lines (for example trailing newlines at the end of the file) are
    skipped instead of raising IndexError.

    :param filename: path of the CSV data file
    :return: X1, X2, X3, y1, y2, y3
             X1, X2, X3: lists of 5-element feature rows for 'Iris-setosa',
                         'Iris-versicolor' and every other label respectively
             y1, y2, y3: lists holding the label string of each matching row
    """
    X1, X2, X3 = [], [], []
    y1, y2, y3 = [], [], []
    # Explicitly named classes; any other label falls through to the third group.
    groups = {
        'Iris-setosa': (X1, y1),
        'Iris-versicolor': (X2, y2),
    }
    # newline='' is the mode the csv module documents for reader input.
    with open(filename, 'r', newline='') as f:
        for line in csv.reader(f):
            # csv.reader yields [] for an empty line; skip those and
            # whitespace-only rows rather than indexing past their end.
            if not any(cell.strip() for cell in line):
                continue
            label = line[4]
            features, labels = groups.get(label, (X3, y3))
            features.append([float(value) for value in line[:4]] + [1.0])
            labels.append(label)
    return X1, X2, X3, y1, y2, y3


def tenfolddata(X1, X2, n_folds=10, fold_size=5):
    """Build stratified folds from a positive and a negative sample list.

    Fold ``i`` takes the ``i``-th consecutive run of ``fold_size`` samples
    from each class, positives first.  With the defaults this yields ten
    folds of five positives and five negatives each.  Labels are generated
    from the number of samples actually taken, so they stay aligned with the
    fold even when an input list is too short to fill every slice.

    :param X1: sequence of positive-class feature rows
    :param X2: sequence of negative-class feature rows
    :param n_folds: number of folds to produce (default 10)
    :param fold_size: samples taken from each class per fold (default 5)
    :return: folds: list of ``n_folds`` lists of feature rows
             y: list of ``n_folds`` label lists (1 = positive, 0 = negative)
    """
    folds = []
    y = []
    for i in range(n_folds):
        start, stop = i * fold_size, (i + 1) * fold_size
        positives = list(X1[start:stop])
        negatives = list(X2[start:stop])
        folds.append(positives + negatives)
        # Derive the labels from the real slice lengths, not a fixed 5 + 5.
        y.append([1] * len(positives) + [0] * len(negatives))
    return folds, y


def LR(X, y, lr=0.01, n_iter=150):
    """Fit a logistic-regression model with batch gradient descent.

    :param X: np.array with shape [N, d], inputs; the constant-1 bias column
              is expected to be part of X already
    :param y: np.array with shape [N, 1], 0/1 labels
    :param lr: learning rate (default 0.01); it strongly affects the result
    :param n_iter: number of gradient-descent steps (default 150)
    :return: beta, np.array with shape [1, d] (bias weight included), the
             parameters reached after ``n_iter`` updates
    """
    d = X.shape[1]
    beta = np.ones((1, d)) * 0.1

    for _ in range(n_iter):
        z = X.dot(beta.T)  # shape [N, 1]
        # sigmoid(z) written as 0.5 * (1 + tanh(z / 2)): mathematically equal
        # to exp(z) / (1 + exp(z)) but it cannot overflow to inf / inf = NaN
        # when |z| is large.
        p1 = 0.5 * (1.0 + np.tanh(0.5 * z))  # shape [N, 1]
        # Gradient of the negative log-likelihood with respect to beta.
        first_order = -np.sum(X * (y - p1), 0, keepdims=True)  # shape [1, d]

        beta -= first_order * lr
    return beta


def testing(beta, X, y):
    """
    基于逻辑回归进行分类任务测试
    :param beta: np.array with shape[1,d], 逻辑回归参数
    :param X: np.array wiht shape[N,d], testing instances
    :param y: np.array with shape[N,1], testing labels
    :return: error_num, LR算法分类错误个数
    """
    predicts = (X.dot(beta.T) >= 0)  # shape[N,1]
    error_num = np.sum(predicts != y)
    return error_num


def tenFoldCrossValidation(folds, y):
    """Run k-fold cross-validation of the logistic-regression model.

    Each fold is held out once for validation while the model is trained on
    all remaining folds.  The number of folds and the feature width are taken
    from the input rather than being fixed at 10 and 5.

    :param folds: list of folds, each a list of feature rows
                  (shape [10, 10, 5] for the data built by ``tenfolddata``)
    :param y: list of label lists, one per fold (shape [10, 10])
    :return: ten_fold_error_nums, total misclassifications over all held-out
             folds
    :raises ValueError: if ``folds`` and ``y`` differ in length or fewer than
                        two folds are supplied
    """
    n_folds = len(folds)
    if n_folds != len(y):
        raise ValueError("folds and y must contain the same number of folds")
    if n_folds < 2:
        raise ValueError("cross-validation needs at least two folds")

    # Convert every fold once up front instead of on each iteration.
    fold_arrays = [np.asarray(fold, dtype=float) for fold in folds]
    label_arrays = [np.asarray(labels).reshape(-1, 1) for labels in y]

    ten_fold_error_nums = 0
    for i in range(n_folds):
        # Stack every fold except the i-th into one [n, d] training matrix.
        train_X = np.concatenate(fold_arrays[:i] + fold_arrays[i + 1:])
        train_y = np.concatenate(label_arrays[:i] + label_arrays[i + 1:])

        beta = LR(train_X, train_y)
        ten_fold_error_nums += testing(beta, fold_arrays[i], label_arrays[i])
    return ten_fold_error_nums


def Loo(X, y):
    """Run leave-one-out validation of the logistic-regression model.

    Every sample is held out once; the model is trained on all other samples
    and tested on the held-out one.  The sample count and feature width are
    taken from the input rather than being fixed at 100 and 5.

    :param X: list of feature rows (shape [100, 5] for two iris classes)
    :param y: list of 0/1 labels, one per row of ``X``
    :return: loo_error_nums, number of held-out samples that were
             misclassified
    :raises ValueError: if ``X`` and ``y`` differ in length
    """
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of samples")

    # Convert once; the loop below only slices these arrays.
    samples = np.asarray(X, dtype=float)
    labels = np.asarray(y).reshape(-1, 1)

    loo_error_nums = 0
    for i in range(len(samples)):
        # Training set: everything except sample i.
        train_X = np.concatenate((samples[:i], samples[i + 1:]))
        train_y = np.concatenate((labels[:i], labels[i + 1:]))
        # i:i + 1 keeps the held-out sample two-dimensional ([1, d]).
        val_X = samples[i:i + 1]
        val_y = labels[i:i + 1]

        beta = LR(train_X, train_y)
        loo_error_nums += testing(beta, val_X, val_y)
    return loo_error_nums


if __name__ == '__main__':
    import sys

    # The data file can be given as the first command-line argument; without
    # one the script falls back to the original hard-coded location.
    dataset = sys.argv[1] if len(sys.argv) > 1 else 'C:\\Users\\14399\\Desktop\\iris.csv'
    X1, X2, X3, y1, y2, y3 = readData(dataset)

    # Binary problems, evaluated in this order: X1 vs X2, X1 vs X3, X2 vs X3.
    # The first list of each pair is treated as the positive class.
    class_pairs = [(X1, X2), (X1, X3), (X2, X3)]

    # Ten-fold cross-validation: print the total error count for each pair.
    for positive, negative in class_pairs:
        folds, y = tenfolddata(positive, negative)
        print(tenFoldCrossValidation(folds, y))

    # Leave-one-out: print the total error count for each pair.
    for positive, negative in class_pairs:
        X = positive + negative
        y = [1] * len(positive) + [0] * len(negative)
        print(Loo(X, y))

结果(每组三个数依次对应 X1/X2、X1/X3、X2/X3 三组二分类的错误个数):
十折交叉验证: 0  0  15;留一法: 0  0  11。
对 X 进行扩展(每个样本末尾补 1,即 [x;1])后结果更好,分别为: 0 0 3 和 0 0 4。

数据集:UCI  iris数据集  

链接:https://pan.baidu.com/s/1CWMvPZdsYsKYncJsl0P5bQ  提取码:lx4r

参考:https://blog.csdn.net/VictoriaW/article/details/77989486

猜你喜欢

转载自blog.csdn.net/weixin_41056428/article/details/83064240