《机器学习实战》中关于 k-means 的代码会报各种错误，修改后的代码如下，供大家参考。
def loadDataSet(fileName):
    """Parse a space-delimited text file of numbers into a list of rows.

    Each line of *fileName* is stripped, split on single spaces, and every
    field is converted to float.

    Returns a list of lists of floats (one inner list per line).
    """
    dataMat = []
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split(' ')
            dataMat.append([float(tok) for tok in curLine])
    return dataMat
# 距离函数
import numpy as np
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two row vectors."""
    diff = vecA - vecB
    # sqrt of the sum of squared component differences
    return np.sqrt(np.sum(np.multiply(diff, diff)))
# 初始化质心
def randCent(dataSet, k):
    """Create *k* random centroids for *dataSet*.

    Each centroid coordinate is drawn uniformly within the corresponding
    column's [min, max] range, so every centroid lies inside the data's
    bounding box.

    Returns a (k, n) np.matrix.
    """
    dataSet = np.mat(dataSet)
    n = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k, n)))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        # Original wrote np.max(col - minJ); max(col) - minJ is the same
        # value (subtracting a scalar commutes with max) but states the
        # intent -- the column's spread -- directly.
        rangeJ = np.max(dataSet[:, j]) - minJ
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Standard (Lloyd's) k-means clustering.

    Parameters
    ----------
    dataSet : array-like, shape (m, n); converted to np.matrix internally.
    k : number of clusters.
    distMeas : distance function taking two row vectors.
    createCent : centroid initializer, called as createCent(dataSet, k).

    Returns
    -------
    centroids : (k, n) np.matrix of cluster centers.
    clusterAssment : (m, 2) np.matrix; column 0 holds the assigned cluster
        index, column 1 the squared distance to that cluster's centroid.
    """
    dataSet = np.mat(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroids = np.mat(createCent(dataSet, k))
    clusterChanged = True
    # Iterate until no sample changes its cluster assignment.
    while clusterChanged:
        clusterChanged = False
        # Assignment step: give each sample to its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard against an empty cluster: np.mean over zero rows would
            # yield NaN and poison the centroid (original did not check).
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# 测试
# dataMat=loadDataSet('testSet.txt')
# dataMat=np.mat(dataMat)
# centroids,clusterAssment=kMeans(dataMat,4)
#
#
# #绘图
# import matplotlib.pyplot as plt
# fig=plt.figure()
# plt.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],c=clusterAssment[:,0].flatten().A[0])
# plt.scatter(centroids[:,0].flatten().A[0],centroids[:,1].flatten().A[0],marker='+',c='red')
# plt.show()
# 二分K均值聚类算法
def biKmeans(dataSet, k, distMeas=distEclud):
    """Bisecting (binary-split) k-means.

    Starts from a single cluster containing all samples, then repeatedly
    2-means-splits whichever existing cluster's split lowers the total SSE
    the most, until *k* clusters exist.

    Parameters
    ----------
    dataSet : array-like, shape (m, n); converted to np.matrix internally.
    k : desired number of clusters.
    distMeas : distance function taking two row vectors.

    Returns
    -------
    (centroids, clusterAssment) with the same layout as kMeans():
    a (k, n) np.matrix of centers and an (m, 2) assignment matrix of
    [cluster index, squared distance].
    """
    dataSet = np.mat(dataSet)
    m = np.shape(dataSet)[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    # Initial centroid: the mean of the whole data set.
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    print(centList)
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.mat(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        # Try splitting every current cluster; keep the split that gives
        # the lowest combined SSE (split part + untouched part).
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = np.sum(splitClustAss[:, 1])
            sseNotSplit = np.sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("划分部分的误差:" + str(sseSplit))
            print("为划分部分的误差:" + str(sseNotSplit))
            if (sseNotSplit + sseSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseNotSplit + sseSplit
        # Relabel the winning split's two halves: local label 1 becomes a
        # brand-new cluster index, local label 0 keeps the old index.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # Replace the split centroid and append the new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return np.mat(centList), clusterAssment
# 测试
# Demo: run bisecting k-means on the sample data and plot the result.
# Guarded so that importing this module does not trigger file I/O/plotting.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    dataMat = np.mat(loadDataSet('testSet2.txt'))
    centList, clusterAssment = biKmeans(dataMat, 3)
    fig = plt.figure()
    # Samples colored by cluster index; centroids drawn as red '+' marks.
    plt.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0],
                c=clusterAssment[:, 0].flatten().A[0])
    plt.scatter(centList[:, 0].flatten().A[0], centList[:, 1].flatten().A[0],
                marker='+', c='red')
    plt.show()
K-Means 聚类算法实现
转载自 blog.csdn.net/uncledrew2017/article/details/82803860