knn算法实例-用knn算法改进约会网站的配对效果

步骤:

1、收集数据

2、准备数据

3、分析数据

4、训练算法

5、测试算法

6、使用算法

1、本文使用的数据是海伦收集的约会数据(datingTestSet2.txt),可以从 https://download.csdn.net/download/zuyuhuo6777/10627552 下载。

详细代码如下并附有详细解释:

#准备数据:从文本文件中解析数据
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
#from KNN import classify0
import KNN

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must contain at least three numeric feature fields
    followed by an integer class label in the last field.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array holding the first three fields of every row and
        ``classLabelVector`` is a list of the ``n`` integer labels.

    Raises:
        ValueError: if a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at end of file) are skipped;
        # otherwise an empty string would reach the float conversion below.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # Size the matrix from the rows actually parsed, not the raw line count.
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))  # label is the last column
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataset, ranges, minVals)``:
        ``normDataset`` is the rescaled array (same shape as ``dataSet``),
        ``ranges`` is the per-column ``max - min`` and ``minVals`` is the
        per-column minimum. ``ranges`` is returned as computed, so it is 0 for
        a constant column.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals

    # A constant column has range 0; dividing by it would turn the whole
    # column into NaN. Dividing by 1 instead leaves that column at 0.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataset = (dataSet - minVals) / safeRanges
    return normDataset, ranges, minVals

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: feature vector to classify.
        dataSet: 2-D NumPy array of training samples, one per row.
        labels: sequence of class labels, ``labels[i]`` belonging to row ``i``
            of ``dataSet``.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` rows of ``dataSet``
        closest to ``inX`` by Euclidean distance.
    """
    # Broadcasting subtracts every training row from the query vector.
    offsets = inX - dataSet
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()  # row indices, closest first

    # Tally one vote per neighbour for the k closest rows.
    votes = {}
    for rank in range(k):
        label = labels[nearestFirst[rank]]
        votes[label] = votes.get(label, 0) + 1

    # Highest vote count first; the winner is the label of the top entry.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    winner = ranked[0][0]
    return winner
    
def datingClassTest(hoRatio=0.05, filename='datingTestSet2.txt', k=3):
    """Estimate the classifier's error rate with a hold-out test.

    The first ``hoRatio`` fraction of the normalised samples is used as the
    test set and the remaining rows as the training set. Every prediction and
    the final error rate are printed.

    Args:
        hoRatio: fraction of the rows held out for testing. The default is
            0.05 (5%), the value this function has always used, although the
            original comments described it as 10%; pass ``0.1`` for a 10%
            hold-out.
        filename: data file in the format expected by ``file2matrix``.
        k: number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]  # number of samples
    numTestVecs = int(m * hoRatio)  # size of the hold-out set

    # With no hold-out rows the error rate would be 0/0; report and stop
    # instead of raising ZeroDivisionError.
    if numTestVecs == 0:
        print("no test vectors selected (rows=%d, hoRatio=%f)" % (m, hoRatio))
        return

    errorCount = 0.0
    for i in range(numTestVecs):
        # Use the classify0 defined in this file, training only on the rows
        # that are not part of the hold-out set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifer came back with :%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))

        

def classifyPerson():
    """Interactively classify one person from three values typed by the user.

    Prompts for the three features, normalises them with the minimum and
    range computed from ``datingTestSet2.txt`` and prints the predicted
    category. Labels returned by the classifier are expected to be 1, 2 or 3,
    since they are mapped onto ``resultList`` by subtracting one.

    Raises:
        ValueError: if an answer cannot be converted to ``float``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    # Prompt previously read "frequent filter miles"; "flier" is what is meant.
    ffMiles = float(input("frequent flier miles earned per year"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): this order assumes the data file's columns are
    # miles, game-time percentage, ice cream - confirm against the data set.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query with the same min/range as the training data, then
    # classify with the classify0 defined in this file.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("You will probably like this person:", resultList[classifierResult - 1])
    

# Disabled scatter plot of the first two feature columns.
# NOTE(review): `datingDataMat` and `b` are not defined at module level in
# this file, so these lines need both to be supplied before being re-enabled.
#fig =plt.figure()
#ax=fig.add_subplot(111)
#ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(b),15.0*array(b))
#plt.show()

# Runs the interactive classifier whenever this file is executed or imported.
classifyPerson()

代码在我的 PC 上(Python 3)测试通过;如果大家在运行过程中遇到问题,欢迎留言讨论。

猜你喜欢

转载自blog.csdn.net/zuyuhuo6777/article/details/82081323