步骤:
1、收集数据
2、准备数据
3、分析数据
4、训练算法
5、测试算法
6、使用算法
1、收集数据:本文使用的是海伦收集的约会数据(文件名:datingTestSet2.txt),可以从 https://download.csdn.net/download/zuyuhuo6777/10627552 下载。
详细代码如下并附有详细解释:
#准备数据:从文本文件中解析数据
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
#from KNN import classify0
import KNN
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is split on tab characters. The first three fields
    are stored as floats in one row of the matrix; the last field is
    converted to ``int`` and appended to the label list.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        array with one row per non-blank line and three columns, and
        ``classLabelVector`` is a list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager closes the file even if parsing fails below.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at end of file) carry no record.
    records = [line for line in stripped if line]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # First three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalization.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(normDataset, ranges, minVals)``: the normalized array,
        the per-column range (max - min, zeros included unchanged) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns normalize to 0.
    # The returned ``ranges`` keeps the true values for callers.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataset = (dataSet - minVals) / safeRanges
    return normDataset, ranges, minVals
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``. When several labels share the highest vote count, the label
    that was met first while walking the neighbours from nearest to farthest
    is returned.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D array of training samples, one per row.
        labels: Sequence of labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of neighbours that vote. Values larger than the number of
            training samples are reduced to that number.

    Returns:
        The label with the most votes.

    Raises:
        IndexError: If no neighbour votes (``k`` < 1 or empty ``dataSet``).
    """
    dataSetSize = dataSet.shape[0]
    # There are only dataSetSize neighbours; a larger k would index past them.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row without an explicit tile().
    diffMat = inX - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # nearest first

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Stable sort: among equal counts the first-inserted label stays first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def datingClassTest(hoRatio=0.05, filename='datingTestSet2.txt', k=3):
    """Hold out the leading part of the data set and print the error rate.

    The file is loaded with ``file2matrix`` and normalized with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are used as test vectors and the
    remaining rows as training data for ``KNN.classify0``. Every prediction
    and the final error rate are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing. The default is
            0.05, i.e. 5% of the data.
        filename: Data file to load.
        k: Number of neighbours passed to the classifier.

    Raises:
        ValueError: If the hold-out fraction selects no test vectors, since
            no error rate can be computed.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]  # number of rows
    numTestVecs = int(m * hoRatio)  # size of the hold-out set
    if numTestVecs == 0:
        # Without this guard the final division fails with ZeroDivisionError.
        raise ValueError(
            "hoRatio=%r selects no test vectors from %d rows" % (hoRatio, m))

    errorCount = 0.0
    for i in range(numTestVecs):
        # Train on the rows after the hold-out set, test on row i.
        classifierResult = KNN.classify0(normMat[i, :],
                                         normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m], k)
        print("the classifer came back with :%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
def classifyPerson():
    """Ask for three feature values and print the predicted liking level.

    Reads three numbers from standard input, loads and normalizes
    'datingTestSet2.txt', scales the entered values with the same minimum
    and range, classifies them with ``KNN.classify0`` (k = 3) and prints the
    matching description.
    """
    # The predicted label minus one is used as the index into this list.
    likingLevels = ['not at all', 'in small doses', 'in large doses']

    gameShare = float(input("percentage of time spent playing video games?"))
    milesPerYear = float(input("frequent filter miles earned per year"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    features, knownLabels = file2matrix('datingTestSet2.txt')
    scaledFeatures, spans, minima = autoNorm(features)

    # Order is miles, game share, ice cream - presumably the column order of
    # the data file; verify against the data.
    rawQuery = array([milesPerYear, gameShare, iceCreamLiters])
    scaledQuery = (rawQuery - minima) / spans

    predicted = KNN.classify0(scaledQuery, scaledFeatures, knownLabels, 3)
    print("You will probably like this person:", likingLevels[predicted - 1])
# Disabled scatter plot of the first two feature columns.
# NOTE(review): `b` and a module-level `datingDataMat` are not defined in the
# visible part of this file; presumably the loaded data and its label list
# were intended - confirm before re-enabling.
#fig =plt.figure()
#ax=fig.add_subplot(111)
#ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(b),15.0*array(b))
#plt.show()

# Run the interactive classifier only when executed as a script, so that
# importing this module does not block on input().
if __name__ == "__main__":
    classifyPerson()
以上代码已在我的 PC 上测试通过(Python 3.0)。大家如果在运行过程中遇到问题,欢迎留言讨论。