The data file is stored as {python data—test2} on the computer's F: drive, or in the Tencent Weiyun folder "Redhur's Advanced".
Beijing,2959.19,730.79,749.41,513.34,467.87,1141.82,478.42,457.64
Tianjin,2459.77,495.47,697.33,302.87,284.19,735.97,570.84,305.08
Hebei,1495.63,515.90,362.37,285.32,272.95,540.58,364.91,188.63
Shanxi,1406.33,477.77,290.15,208.57,201.50,414.72,281.84,212.10
…
import numpy as np
from sklearn.cluster import KMeans
def loadData(filePath):
    """Load city names and their numeric feature rows from a text file.

    Each non-blank line is expected to look like ``name,v1,v2,...``: the
    first comma-separated field is kept as the city name and every remaining
    field is parsed as a float.

    Args:
        filePath: Path to a UTF-8 encoded, comma-separated text file.

    Returns:
        A tuple ``(retData, retCityName)``. ``retData`` is a list with one
        list of floats per data line; ``retCityName`` holds the matching
        names in the same order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a value field cannot be parsed as a float.
    """
    retData = []
    retCityName = []
    # Read-only mode is sufficient, and the context manager guarantees the
    # handle is closed even if parsing raises.
    with open(filePath, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline) so they do not
                # produce an empty name paired with an empty feature row.
                continue
            items = line.split(",")
            retCityName.append(items[0])
            retData.append([float(value) for value in items[1:]])
    return retData, retCityName
'''
retData looks roughly like this:
[[2959.19, 730.79, 749.41, 513.34, 467.87, 1141.82, 478.42, 457.64],
[2459.77, 495.47, 697.33, 302.87, 284.19, 735.97, 570.84, 305.08]...]
The K-Means clustering algorithm uses Euclidean distance by default.
'''
if __name__ == '__main__':
    # Script entry point: cluster the cities by their spending figures, then
    # print each cluster's centre, its mean total expense and its cities.
    n_clusters = 4  # number of cluster centres requested from K-Means
    filepath = r'E:\快乐的程序猿\city.txt'
    data, cityName = loadData(filepath)

    km = KMeans(n_clusters=n_clusters)
    # fit_predict() computes the cluster centres and assigns a cluster index
    # to every row; label looks roughly like [2 0 3 3 3 1 3 3 2 0 ...].
    label = km.fit_predict(data)

    print(km.cluster_centers_)  # per-cluster mean of each spending column
    print("--------------------------------------------------------------------")
    # Summing each centre across its columns gives the cluster's mean total
    # expense.
    expenses = np.sum(km.cluster_centers_, axis=1)
    print(expenses)
    print("--------------------------------------------------------------------")

    # Bucket the city names by cluster label. One bucket is created per
    # cluster so the list always matches n_clusters instead of being
    # hard-coded to four entries.
    CityCluster = [[] for _ in range(n_clusters)]
    for name, cluster in zip(cityName, label):
        CityCluster[cluster].append(name)

    for expense, cities in zip(expenses, CityCluster):
        # "{:.2f}" rounds to two decimals; the previous "{}.2f" printed the
        # unrounded float followed by a literal ".2f".
        print("Expenses:{:.2f}".format(expense))
        print(cities)