# Machine Learning in Action --- Logistic Regression

from numpy import *

def loadDataSet():
    """Parse testSet.txt into a feature matrix and a label list.

    Each line of the file holds two floats (the features) and an integer
    label, whitespace-separated.  A constant 1.0 is prepended to every
    sample as the bias (x0) term.

    Returns:
        (dataMat, labelMat): list of [1.0, x1, x2] rows, list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file is closed; the original leaked the handle
    with open('testSet.txt') as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()  # split each line into string tokens
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
	
def sigmoid(inX):
    """Logistic sigmoid: map a scalar or numpy array into (0, 1)."""
    denom = 1 + exp(-inX)
    return 1.0 / denom
	
def gradAscent(dataMatIn, classLabels):
    """Fit logistic-regression weights by batch gradient ascent.

    Args:
        dataMatIn: 2-D list of samples; each row starts with the bias 1.0.
        classLabels: list of 0/1 class labels, one per sample.

    Returns:
        numpy matrix of shape (n, 1) holding the fitted weights.
    """
    features = mat(dataMatIn)                # m x n numpy matrix
    targets = mat(classLabels).transpose()   # labels as an m x 1 column vector
    num_samples, num_features = shape(features)
    step_size = 0.001                        # learning rate
    max_cycles = 500                         # fixed iteration budget
    coeffs = ones((num_features, 1))         # initial weights: all ones
    for _ in range(max_cycles):
        predictions = sigmoid(features * coeffs)  # m x 1 column of P(y=1)
        residual = targets - predictions
        # ascend the gradient of the log-likelihood
        coeffs = coeffs + step_size * features.transpose() * residual
    return coeffs
	
def plotBestFit(weights):
    """Scatter-plot the two classes of testSet.txt and draw the fitted
    decision boundary w0 + w1*x + w2*y = 0.

    Args:
        weights: the three fitted coefficients; a numpy matrix (as returned
            by gradAscent), a 2-D array, or a flat sequence all work.
    """
    import matplotlib.pyplot as plt  # fixed typo: was 'matplotlib.pylob'
    # flatten so weights[k] is a scalar even when a (3,1) matrix is passed in
    weights = asarray(weights).flatten()
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)         # list of lists -> numpy array
    n = shape(dataArr)[0]            # number of samples
    xcord1 = []; ycord1 = []         # positive class (label 1)
    xcord2 = []; ycord2 = []         # negative class (label 0)
    for i in range(n):
        if int(labelMat[i]) == 1:    # fixed: was int(labelMat[i]==1)
            xcord1.append(dataArr[i, 1])  # first feature (column 1)
            ycord1.append(dataArr[i, 2])  # second feature (column 2)
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')  # s=size, marker=shape
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
	
def stocGradAscent0(dataMatrix, classLabels):
    """One sequential pass of stochastic gradient ascent.

    Args:
        dataMatrix: numpy array of shape (m, n), one sample per row.
        classLabels: sequence of m labels (0 or 1).

    Returns:
        1-D numpy array of n fitted weights.
    """
    num_samples, num_features = shape(dataMatrix)
    learning_rate = 0.01
    coeffs = ones(num_features)  # 1-D weight vector, initialised to ones
    for sample_idx in range(num_samples):
        # one sample at a time, so the prediction is a scalar, not a vector
        prediction = sigmoid(sum(dataMatrix[sample_idx] * coeffs))
        residual = classLabels[sample_idx] - prediction
        coeffs = coeffs + learning_rate * residual * dataMatrix[sample_idx]
    return coeffs
	
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    The step size decays as training proceeds, and within each pass the
    samples are visited in random order without replacement.

    Args:
        dataMatrix: numpy array of shape (m, n), one sample per row.
        classLabels: sequence of m labels (0 or 1).
        numIter: number of full passes over the data set.

    Returns:
        1-D numpy array of n fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):  # full passes over the data set
        # list(): range() objects do not support del on Python 3
        dataIndex = list(range(m))
        for i in range(m):
            # step size shrinks with iteration count but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.01
            # pick a random *remaining* sample index
            randIndex = int(random.uniform(0, len(dataIndex)))
            # fixed: the original indexed dataMatrix with randIndex directly,
            # so already-consumed samples could be revisited and the
            # without-replacement scheme described here never actually held
            sampleIdx = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            # remove the used sample so the next draw picks a fresh one
            del dataIndex[randIndex]
    return weights
	
def classifyVector(inX, weights):
    """Classify one feature vector with trained weights.

    Returns 1.0 when the predicted probability exceeds 0.5, else 0.0.
    """
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0

def colicTest():
    """Train on horseColicTraining.txt, then measure the error rate on
    horseColicTest.txt.

    Each line of both files holds 21 tab-separated feature values followed
    by the label in column 22.

    Returns:
        float error rate over the test file.
    """
    trainingSet = []
    trainingLabels = []
    # 'with' closes the files; the original leaked both handles
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain.readlines():
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]  # 21 features
            trainingSet.append(lineArr)
            trainingLabels.append(float(currLine[21]))
    # 500 passes over the training set
    # fixed: the original line ended with a stray ':' (SyntaxError)
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0.0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest.readlines():
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    # parenthesized print works on both Python 2 and 3
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
		
def multiTest():
    """Run colicTest 10 times and report the average error rate.

    The original body mixed tabs and spaces, which is a SyntaxError;
    indentation is normalized here.
    """
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    # parenthesized print works on both Python 2 and 3
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))

# Source: reposted from blog.csdn.net/carl95271/article/details/80765916