"""k-nearest-neighbours (kNN) classifier demo on the dating data set.

The module loads a tab-separated data file with three numeric features and
an integer class label per row, normalises the features to the 0-1 range
and classifies samples by majority vote among the k nearest neighbours.
"""
# NOTE: the wildcard import is kept for backward compatibility. Depending on
# the NumPy version it can shadow builtins such as ``min``/``max``/``sum``,
# so the code below deliberately avoids calling those builtins by bare name.
from numpy import *
import operator
from os import listdir

# Plotting is only needed by the optional writeData2Picture() helper, so a
# missing matplotlib must not prevent the classifier itself from being used.
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None

# Path of the data set
filePath = 'E:\\bigData\\ml\\dataset\\datingTestSet2.txt'


# Core kNN algorithm
def classify0(inX, dataSet, labels, k):
    """Predict the class of ``inX`` by majority vote of its nearest neighbours.

    The Euclidean distance between ``inX`` and every row of ``dataSet`` is
    computed, the ``k`` closest rows are selected and the label that occurs
    most often among them is returned. When several labels share the highest
    count, the one that was met first (i.e. belongs to the closest sample
    among the tied labels) wins.

    Args:
        inX: sample to classify (sequence or array with one value per feature).
        dataSet: array of already classified samples, one sample per row.
        labels: class label of each row of ``dataSet``.
        k: number of nearest neighbours to consult. Values larger than the
            number of rows in ``dataSet`` are clamped to that number.

    Returns:
        The predicted label (an element of ``labels``).

    Raises:
        ValueError: if ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    # Number of samples (rows) in the training matrix
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0 or k < 1:
        raise ValueError("classify0 needs a non-empty dataSet and k >= 1")

    # Per-feature differences (x1 - y1); broadcasting repeats inX for each row
    diffMat = asarray(inX) - dataSet
    # (x1 - y1)^2 + (x2 - y2)^2 + ... + (xn - yn)^2 for every row
    sqDistances = (diffMat ** 2).sum(axis=1)
    # Square root gives the Euclidean distance
    distances = sqDistances ** 0.5

    # Row indices ordered from the nearest sample to the farthest one
    sortedDistIndicies = distances.argsort()

    # Never look at more neighbours than there are samples. A conditional
    # expression is used because the wildcard NumPy import may shadow min().
    neighbourCount = k if k < dataSetSize else dataSetSize

    # Count how often each label occurs among the nearest neighbours
    classCount = {}
    for i in range(neighbourCount):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # The sort is stable, so ties keep their insertion (nearest-first) order
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Convert a data file into a feature matrix and a label list
def file2matrix(filename):
    """Load a tab-separated data file.

    Every non-blank line must hold at least three numeric feature columns;
    the last column is parsed as the integer class label.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array with one row per non-blank line and three columns, and
        ``classLabelVector`` is the list of integer labels in file order.
    """
    # Read the file once and close it deterministically; blank lines (for
    # example a trailing newline at the end of the file) are ignored.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))   # matrix to return
    classLabelVector = []               # labels to return
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# Normalise the feature values, mapping them into the 0-1 range
def autoNorm(dataSet):
    """Scale every feature column of ``dataSet`` to the range 0-1.

    Args:
        dataSet: array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised matrix,
        the per-column divisor that was used and the per-column minimum.
        ``ranges`` equals ``max - min`` for each column, except that a
        constant column (range 0) uses 1 so that it maps to 0 instead of
        producing NaN through a division by zero.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Guard against division by zero for constant columns
    ranges = where(ranges == 0, 1, ranges)
    # Element-wise operations; broadcasting applies the row vectors to each row
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


# Test the kNN classifier
def datingClassTest():
    """Measure the error rate of the classifier on the data at ``filePath``.

    The first ``hoRatio`` share of the normalised samples is classified using
    the remaining samples as the training set. Each prediction, the overall
    error rate and the number of errors are printed.
    """
    hoRatio = 0.50      # hold out 50% of the samples for testing
    datingDataMat, datingLabels = file2matrix(filePath)   # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Nothing to test; also avoids dividing by zero below
        print("not enough samples to run the hold-out test")
        return
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))
    print(errorCount)


# Interactive classification
def classifyPerson():
    """Ask for the three feature values and print the predicted category.

    The data set at ``filePath`` is used as the training set; the entered
    values are normalised with the same minimum and range as the training
    data before they are classified with k = 3.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("litres of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filePath)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr-minVals)/ranges, normMat, datingLabels, 3)
    # Labels are 1-based while resultList is 0-based
    print("Your will probably like this person:", resultList[classifierResult-1])


# Data visualisation (optional)
def writeData2Picture():
    """Show a scatter plot of the first two feature columns.

    Marker size and colour are both derived from the class label.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib is required for writeData2Picture()")
    datingDataMat, datingLabels = loadingData()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               15.0*array(datingLabels), 15.0*array(datingLabels))
    plt.show()


# Data loading (optional)
def loadingData():
    """Load the data set at ``filePath`` and return ``(matrix, labels)``."""
    datingDataMat, datingLabels = file2matrix(filePath)
    return datingDataMat, datingLabels


# Manual test of file2matrix() and classify0(); call it from the main block (optional)
def myKNNTest1():
    """Load the data set, print it and classify the all-zero sample."""
    group, labels = loadingData()
    print(group)
    # The loaded matrix has three feature columns, so the sample needs three values
    category = classify0([0, 0, 0], group, labels, 3)
    print(category)


# Entry point
if __name__ == '__main__':
    classifyPerson()
# 代码运行截图
# 数据集下载地址:
# 链接:https://pan.baidu.com/s/1MR7CnBU8bZztb1tlpR4XyQ
# 密码:jeec