The necessary mathematical concepts
1. Discrete random variables and their distributions
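1.1: 0-1 distribution (two-point / Bernoulli distribution): for a single trial with success probability $p$, the probability mass function is $P(X = k) = p^{k}(1-p)^{1-k}$ for $k \in \{0, 1\}$; expectation $E(X) = 1 \cdot p + 0 \cdot (1-p) = p$, variance $D(X) = p(1-p)$.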
1.2: Binomial distribution ($n$ independent Bernoulli trials): the probability mass function is $P(X = k) = \binom{n}{k} p^{k}(1-p)^{n-k}$ for $k = 0, 1, \dots, n$; expectation $E(X) = np$, variance $D(X) = np(1-p)$.
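1.3: Poisson distribution: the probability mass function is $P(X = k) = \frac{\lambda^{k} e^{-\lambda}}{k!}$ for $k = 0, 1, 2, \dots$; expectation $E(X) = \lambda$, variance $D(X) = \lambda$.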
2. Maximum likelihood estimation: choose the parameters under which the known (observed) data have the highest probability of occurring.
The conditional probability $P(X \mid \theta)$ is the probability of observing $X$ given the parameter $\theta$. The corresponding likelihood function $L(\theta \mid X)$ treats the data $X$ as known and is a function of the parameter $\theta$; its value is equal to $P(X \mid \theta)$.
Several common linear regression
① Linear regression: $f(x_i) = w x_i + b$, where $x_i$ is the feature and $w$ and $b$ are the parameters to be determined; training the model means finding the best $w$ and $b$.
Loss function of linear regression: the mean squared error is used to measure whether $w$ and $b$ are optimal, $J(\theta) = \sum_{i} [f(x_i) - y_i]^{2}$. The goal is to minimize this error: take the partial derivatives, set them to 0, and solve for the optimum. This process of finding the minimum error is called the least squares method.
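Setting the partial derivatives to zero gives the optimal solution: $w = \dfrac{\sum_{i=1}^{m} y_i (x_i - \bar{x})}{\sum_{i=1}^{m} x_i^{2} - \frac{1}{m}\left(\sum_{i=1}^{m} x_i\right)^{2}}$, $b = \dfrac{1}{m}\sum_{i=1}^{m} (y_i - w x_i)$, where $\bar{x}$ is the mean of the $x_i$.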
Least squares: minimize the sum of the squared differences between the estimated values and the actual values. The emphasis is on the optimization objective.
② Multiple linear regression: $w$ is a vector; the optimal solution can likewise be obtained by least squares, $\hat{w} = (X^{T}X)^{-1}X^{T}y$ (when $X^{T}X$ is invertible), which gives the final multiple linear model.
③ Log-linear regression: $\ln y = w^{T}x + b$.
④ Logistic regression (log-odds regression), based on the sigmoid function $g(z) = \dfrac{1}{1 + e^{-z}}$, with $h_\theta(x) = g(\theta^{T}x)$.
Loss function of logistic regression:
Maximum likelihood: choose the parameters that maximize the probability of the known data. (This requires knowing the probability distribution of the known data.)
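Originally the likelihood is $L(\theta) = \prod_{i=1}^{m} h_\theta(x_i)^{y_i}\,(1 - h_\theta(x_i))^{1 - y_i}$; to simplify the calculation we take the logarithm: $J(\theta) = \ln L(\theta) = \sum_{i=1}^{m} \left[ y_i \ln h_\theta(x_i) + (1 - y_i) \ln(1 - h_\theta(x_i)) \right]$.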
Gradient ascent algorithm: $\theta_j := \theta_j + \alpha \dfrac{\partial J(\theta)}{\partial \theta_j}$, where $\alpha$ is the learning rate and the partial derivative that follows it is the gradient.
Optimizing the logistic loss function with gradient ascent: the iterative update formula is obtained with the chain rule for the composite function $J(\theta) \to h_\theta(x) = g(\theta^{T}x) \to \theta^{T}x \to \theta$, which gives $\theta_j := \theta_j + \alpha \sum_{i=1}^{m} (y_i - h_\theta(x_i))\, x_{ij}$.
This yields the array of weights $\theta_j$.
Code section
Code implementing the gradient ascent algorithm:
def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)), element-wise.

    Args:
        inX: A scalar or NumPy array.

    Returns:
        The sigmoid of ``inX`` with the same shape as the input.
    """
    # 1 / (1 + e^-x) == exp(-log(1 + e^-x)).  logaddexp evaluates the log term
    # without overflowing, so large negative inputs yield 0.0 instead of
    # triggering an overflow RuntimeWarning inside np.exp.
    return np.exp(-np.logaddexp(0, -inX))


def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: 2-D sequence of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        alpha: Step size (learning rate) of every update.
        maxCycles: Number of full-batch updates to perform.

    Returns:
        numpy.ndarray of shape (n, 1) holding the fitted weights (theta_j).

    Raises:
        ValueError: If ``dataMatIn`` is not 2-D or the number of labels does
            not match the number of rows.
    """
    features = np.asarray(dataMatIn, dtype=float)
    if features.ndim != 2:
        raise ValueError("dataMatIn must be a 2-D array-like (samples x features)")
    labels = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    if labels.shape[0] != features.shape[0]:
        raise ValueError("classLabels must contain one label per row of dataMatIn")

    n = features.shape[1]
    weights = np.ones((n, 1))  # every weight starts at 1
    for _ in range(maxCycles):
        h = sigmoid(np.dot(features, weights))  # predictions for all samples at once
        error = labels - h                      # the (y - h_theta(x)) term of the gradient
        weights = weights + alpha * np.dot(features.T, error)
    return weights
决策边界
# Decision boundary.  The classifier outputs sigmoid(w0 + w1*x1 + w2*x2), and
# the sigmoid equals 0.5 exactly where its argument is 0.  Solving
# w0 + w1*x1 + w2*x2 = 0 for x2 gives the straight line drawn below, which is
# why the weights are combined in this way.
fig = plt.figure()
ax = fig.add_subplot(111)
x = np.arange(-3.0, 3.0, 0.1)
w0, w1, w2 = weights[0], weights[1], weights[2]
y = -(w0 + w1 * x) / w2
ax.plot(x, y)  # draw the decision boundary
plt.title('BestFit')  # figure title
# axis labels
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
分类试验
# -*- coding:UTF-8 -*-
import numpy as np


def loadDataSet(filename='testSet.txt'):
    """Read a whitespace-separated data file of ``x1 x2 label`` rows.

    Args:
        filename: Path of the data file; defaults to ``testSet.txt``.

    Returns:
        A tuple ``(dataMat, labelMat)``: ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 multiplies the intercept
        weight) and ``labelMat`` is the list of integer labels.
    """
    dataMat = []   # feature rows
    labelMat = []  # class labels
    with open(filename) as fr:  # closed even if a row fails to parse
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                continue  # tolerate blank lines (e.g. a trailing newline)
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)), element-wise."""
    # exp(-log(1 + e^-x)) is the same value computed without overflow for
    # large negative inputs.
    return np.exp(-np.logaddexp(0, -inX))


def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: 2-D sequence of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        alpha: Step size (learning rate) controlling the update magnitude.
        maxCycles: Number of full-batch updates.

    Returns:
        numpy.ndarray of shape (n, 1) with the fitted weights.
    """
    features = np.asarray(dataMatIn, dtype=float)
    labels = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    n = features.shape[1]
    weights = np.ones((n, 1))
    for _ in range(maxCycles):
        h = sigmoid(np.dot(features, weights))  # vectorised prediction
        error = labels - h
        weights = weights + alpha * np.dot(features.T, error)
    return weights


def plotBestFit(weights):
    """Scatter the samples of ``testSet.txt`` and draw the decision boundary.

    Args:
        weights: Three weights ``(w0, w1, w2)``; any array-like that flattens
            to three values is accepted, e.g. the (3, 1) array returned by
            ``gradAscent``.
    """
    # Imported here so that loading and training work without a plotting
    # backend being installed.
    import matplotlib.pyplot as plt

    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat, dtype=float).reshape(-1, 3)
    labels = np.array(labelMat, dtype=int)
    positive = dataArr[labels == 1]  # label 1: positive samples
    negative = dataArr[labels != 1]  # anything else: negative samples

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(positive[:, 1], positive[:, 2], s=20, c='red', marker='s', alpha=.5)
    ax.scatter(negative[:, 1], negative[:, 2], s=20, c='green', alpha=.5)

    # The boundary is where sigmoid(w0 + w1*x1 + w2*x2) == 0.5, i.e. where
    # w0 + w1*x1 + w2*x2 == 0; solve that for x2.
    w0, w1, w2 = np.asarray(weights, dtype=float).ravel()
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-w0 - w1 * x) / w2
    ax.plot(x, y)
    plt.title('BestFit')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


if __name__ == '__main__':
    dataMat, labelMat = loadDataSet()
    weights = gradAscent(dataMat, labelMat)
    plotBestFit(weights)
随机梯度上升算法(改进版)
改进一:学习率不断改变(变小)
改进二:随机选择样本,用完之后删除随机选择的样本,可以减少计算量。
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with improved stochastic gradient ascent.

    In every pass each sample is used exactly once, in a fresh random order,
    and the step size shrinks as training progresses.

    Args:
        dataMatrix: numpy.ndarray of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        numIter: Number of passes over the data set.

    Returns:
        numpy.ndarray of shape (n,) holding the fitted weights.
    """
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)  # every weight starts at 1
    for j in range(numIter):
        # random.sample(range(m), m) is a random permutation of the row
        # indices.  Indexing the data with the drawn *row index* (rather than
        # with a position inside a shrinking index list) is what guarantees
        # that every sample is visited once per pass.
        for i, sampleIndex in enumerate(random.sample(range(m), m)):
            # Step size decays with j + i but never drops below 0.01.
            alpha = 4 / (1.0 + j + i) + 0.01
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights
改进前后的梯度上升算法对比
# -*- coding:UTF-8 -*-
import random

import numpy as np


def loadDataSet(filename='testSet.txt'):
    """Read a whitespace-separated data file of ``x1 x2 label`` rows.

    Args:
        filename: Path of the data file; defaults to ``testSet.txt``.

    Returns:
        ``(dataMat, labelMat)``: a list of ``[1.0, x1, x2]`` rows and the
        list of integer labels.
    """
    dataMat = []   # feature rows
    labelMat = []  # class labels
    with open(filename) as fr:  # closed even if a row fails to parse
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                continue  # tolerate blank lines
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)), element-wise."""
    # exp(-log(1 + e^-x)) is the same value computed without overflow for
    # large negative inputs.
    return np.exp(-np.logaddexp(0, -inX))


def gradAscent(dataMatIn, classLabels, alpha=0.01, maxCycles=500):
    """Batch gradient ascent that also records the weights after every update.

    Args:
        dataMatIn: 2-D sequence of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        alpha: Step size (learning rate) controlling the update magnitude.
        maxCycles: Number of full-batch updates.

    Returns:
        ``(weights, weights_array)``: the final weights as an (n, 1) array and
        the history as a (maxCycles, n) array, one row per update.
    """
    features = np.asarray(dataMatIn, dtype=float)
    labels = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    n = features.shape[1]
    weights = np.ones((n, 1))
    weights_array = np.empty((maxCycles, n))  # filled row by row, no re-copying
    for k in range(maxCycles):
        h = sigmoid(np.dot(features, weights))  # vectorised prediction
        error = labels - h
        weights = weights + alpha * np.dot(features.T, error)
        weights_array[k] = weights.ravel()
    return weights, weights_array


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent that records every weight update.

    In every pass each sample is used exactly once, in a fresh random order,
    and the step size shrinks as training progresses.

    Args:
        dataMatrix: numpy.ndarray of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        numIter: Number of passes over the data set.

    Returns:
        ``(weights, weights_array)``: the final weights as an (n,) array and
        the history as a (numIter * m, n) array, one row per update.
    """
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    history = []  # collected in a list; converted once at the end
    for j in range(numIter):
        # A random permutation of the row indices: the drawn value is the row
        # to use, so every sample is visited exactly once per pass.
        for i, sampleIndex in enumerate(random.sample(range(m), m)):
            # Step size decays with j + i but never drops below 0.01.
            alpha = 4 / (1.0 + j + i) + 0.01
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            history.append(weights)
    weights_array = np.array(history, dtype=float).reshape(-1, n)
    return weights, weights_array


def _plotWeightColumn(columnAxes, history, title, font):
    """Draw one column of the figure: weight k against the update number."""
    iterations = np.arange(0, len(history), 1)
    lastRow = len(columnAxes) - 1
    for k, ax in enumerate(columnAxes):
        ax.plot(iterations, history[:, k])
        texts = [ax.set_ylabel(u'W%d' % k, fontproperties=font)]
        if k == 0:  # only the top plot carries the column title
            texts.append(ax.set_title(title, fontproperties=font))
        if k == lastRow:  # only the bottom plot carries the x-axis label
            texts.append(ax.set_xlabel(u'迭代次数', fontproperties=font))
        for text in texts:
            text.set(size=20, weight='bold', color='black')


def plotWeights(weights_array1, weights_array2):
    """Plot how the three weights evolve for two training runs, side by side.

    Args:
        weights_array1: Weight history with one row per update; drawn in the
            left column, which is titled as batch gradient ascent.
        weights_array2: Weight history with one row per update; drawn in the
            right column, which is titled as the improved stochastic variant.
    """
    # Imported here so that loading and training work without a plotting
    # backend being installed.
    import os

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties

    # Font used for the Chinese labels.  Fall back to the default font when
    # the Windows font file is not present instead of failing at draw time.
    fontPath = r"c:\windows\fonts\simsun.ttc"
    if os.path.exists(fontPath):
        font = FontProperties(fname=fontPath, size=14)
    else:
        font = FontProperties(size=14)

    # 3 rows x 2 columns of independent axes on a 20 x 10 inch canvas;
    # axs[r][c] is the plot in row r, column c.
    fig, axs = plt.subplots(nrows=3, ncols=2, sharex=False, sharey=False, figsize=(20, 10))
    _plotWeightColumn(axs[:, 0], weights_array1, u'梯度上升算法:回归系数与迭代次数关系', font)
    _plotWeightColumn(axs[:, 1], weights_array2, u'改进的随机梯度上升算法:回归系数与迭代次数关系', font)
    plt.show()


if __name__ == '__main__':
    dataMat, labelMat = loadDataSet()
    # Order matters: plotWeights titles its left column as batch gradient
    # ascent and its right column as the improved stochastic variant.
    weights1, weights_array1 = gradAscent(dataMat, labelMat)
    weights2, weights_array2 = stocGradAscent1(np.array(dataMat), labelMat)
    plotWeights(weights_array1, weights_array2)
logistic回归:用算法求得权值参数 --> 权值参数与特征相乘求和 --> 将该和代入sigmoid函数,根据函数值是否大于0.5分类(等价于该和是否大于0)
# -*- coding:UTF-8 -*-
import random

import numpy as np


def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)), element-wise."""
    # exp(-log(1 + e^-x)) is the same value computed without overflow for
    # large negative inputs.
    return np.exp(-np.logaddexp(0, -inX))


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with improved stochastic gradient ascent.

    In every pass each sample is used exactly once, in a fresh random order,
    and the step size shrinks as training progresses.

    Args:
        dataMatrix: numpy.ndarray of shape (m, n); one sample per row.
        classLabels: Sequence of m class labels (0 or 1).
        numIter: Number of passes over the data set.

    Returns:
        numpy.ndarray of shape (n,) holding the fitted weights.
    """
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        # A random permutation of the row indices: the drawn value is the row
        # to use, so every sample is visited exactly once per pass.
        for i, sampleIndex in enumerate(random.sample(range(m), m)):
            # Step size decays with j + i but never drops below 0.01.
            alpha = 4 / (1.0 + j + i) + 0.01
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights


def _loadColicFile(path):
    """Parse a tab-separated file whose last column is the class label."""
    features = []
    labels = []
    with open(path) as fr:
        for line in fr:
            fields = line.strip().split('\t')
            if fields == ['']:
                continue  # tolerate blank lines
            features.append([float(value) for value in fields[:-1]])
            # float() first so that labels written as "1.000000" also parse
            labels.append(float(fields[-1]))
    return features, labels


def colicTest(trainFile='horseColicTraining.txt', testFile='horseColicTest.txt', numIter=500):
    """Train on ``trainFile``, classify ``testFile`` and print the error rate.

    Args:
        trainFile: Tab-separated training data, label in the last column.
        testFile: Tab-separated test data, label in the last column.
        numIter: Number of passes used by ``stocGradAscent1``.

    Returns:
        The test error rate as a percentage (float).

    Raises:
        ValueError: If the test file contains no samples.
    """
    trainingSet, trainingLabels = _loadColicFile(trainFile)
    testSet, testLabels = _loadColicFile(testFile)
    if not testSet:
        raise ValueError("test file contains no samples: %r" % (testFile,))

    # Train with the improved stochastic gradient ascent.
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, numIter)

    errorCount = 0
    for lineArr, label in zip(testSet, testLabels):
        # Compare the predicted class with the labelled class.
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(label):
            errorCount += 1
    errorRate = (float(errorCount) / len(testSet)) * 100
    print("测试集错误率为: %.2f%%" % errorRate)
    return errorRate


def classifyVector(inX, weights):
    """Classify one sample.

    Args:
        inX: 1-D feature array.
        weights: 1-D weight array of the same length.

    Returns:
        1.0 when sigmoid(inX . weights) is greater than 0.5, otherwise 0.0.
    """
    prob = sigmoid(np.dot(inX, weights))
    return 1.0 if prob > 0.5 else 0.0


if __name__ == '__main__':
    colicTest()