通过 PCA 可以查看各主成分(由原始特征线性组合得到的新特征)的重要性,即各主成分所解释的方差大小;通过比较可以去除重要性低的主成分,从而降低数据维度,减少数据存储和后续处理的成本。
示例:该示例可以将数据中的590个特征缩减为6个。
pca.py
'''
Created on 2018年8月1日
@author: hcl
'''
from numpy import *
import matplotlib.pyplot as plt
# Load data
def loadDataSet(filename, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Each non-blank line becomes one row; fields are separated by ``delim``
    and converted with ``float`` (so tokens such as ``NaN`` are accepted).

    Args:
        filename: Path of the text file to read.
        delim: Field separator used on every line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # ``with`` guarantees the handle is closed even if parsing fails.
    with open(filename) as fr:
        stringArr = [line.strip().split(delim) for line in fr]
    # A blank line splits to [''], which float() rejects; skip such lines.
    dataArr = [list(map(float, fields)) for fields in stringArr
               if fields != ['']]
    # asmatrix is available on both NumPy 1.x and 2.x (the ``mat`` alias
    # was removed in NumPy 2.0).
    return asmatrix(dataArr)
def pca(dataMat, topN=999999):
    """Project data onto its top principal components.

    Args:
        dataMat: Matrix with one sample per row and one feature per column.
        topN: Number of leading principal components to keep. The default
            is large enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the centered data projected
        onto the kept components, and the data reconstructed from that
        projection back in the original feature space.
    """
    # Center the samples so each feature has zero mean.
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # Covariance between features (rowvar=0: columns are the variables).
    covMat = cov(meanRemoved, rowvar=0)
    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues/eigenvectors, whereas the general eig solver can return
    # complex ones and thereby make the projection complex. Eigenvector
    # signs are arbitrary, so a projected column may be negated relative
    # to another solver; the reconstruction is unaffected.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    # Indices of the topN largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)[:-(topN + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    # Project the centered samples onto the kept eigenvectors.
    lowDDataMat = meanRemoved * redEigVects
    # Map the projection back to the original space and undo the centering.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
# Missing-value handling
def replaceNaNWithMean(filename='secom.data.txt', delim=' '):
    """Load a data file and replace NaN entries with their column mean.

    Args:
        filename: Path of the data file to load; defaults to the file name
            this script has always used.
        delim: Field separator of the data file (a single space by default).

    Returns:
        The loaded matrix in which every NaN has been replaced by the mean
        of the non-NaN values of the same column. A column that contains
        only NaN values is left unchanged.
    """
    # Parse the data file.
    datMat = loadDataSet(filename, delim)
    # Number of feature columns.
    numFeat = shape(datMat)[1]
    # Process one feature column at a time.
    for i in range(numFeat):
        # Compute the NaN mask once and reuse it for both row selections.
        nanMask = isnan(datMat[:, i].A)
        validRows = nonzero(~nanMask)[0]
        if validRows.size == 0:
            # No usable values: the mean is undefined, so keep the NaNs
            # instead of taking the mean of an empty slice.
            continue
        # Mean over the non-NaN entries of this column.
        meanVal = mean(datMat[validRows, i])
        # Overwrite every NaN entry of this column with that mean.
        datMat[nonzero(nanMask)[0], i] = meanVal
    return datMat
if __name__ == '__main__':
    # Optional 2-D demo (disabled): reduce testSet.txt to one component and
    # plot the original points against their reconstruction.
    # dataMat = loadDataSet('testSet.txt')
    # lowMat,reconMat = pca(dataMat,1)
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],marker='^',s=90)
    # ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0],marker='o',s=50,c='red')
    # plt.show()

    # Load the data set with missing values filled in by column means.
    dataMat = replaceNaNWithMean()
    # Center the data and compute the feature covariance matrix.
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    conMat = cov(meanRemoved, rowvar=0)
    # asmatrix replaces the ``mat`` alias, which was removed in NumPy 2.0.
    # linalg.eig is kept here so the printed values keep the complex-typed
    # form of the recorded output that follows this script.
    eigVals, eigVects = linalg.eig(asmatrix(conMat))
    # Print the eigenvalues themselves (not just their shape): the spectrum
    # is what shows how few components carry most of the variance.
    print(eigVals)
输出:
[ 5.34151979e+07+0.00000000e+00j 2.17466719e+07+0.00000000e+00j
8.24837662e+06+0.00000000e+00j 2.07388086e+06+0.00000000e+00j
1.31540439e+06+0.00000000e+00j 4.67693557e+05+0.00000000e+00j
2.90863555e+05+0.00000000e+00j 2.83668601e+05+0.00000000e+00j
2.37155830e+05+0.00000000e+00j 2.08513836e+05+0.00000000e+00j
1.96098849e+05+0.00000000e+00j 1.86856549e+05+0.00000000e+00j
1.52422354e+05+0.00000000e+00j 1.13215032e+05+0.00000000e+00j
1.08493848e+05+0.00000000e+00j 1.02849533e+05+0.00000000e+00j
...
0.00000000e+00+0.00000000e+00j 0.00000000e+00+0.00000000e+00j
0.00000000e+00+0.00000000e+00j 0.00000000e+00+0.00000000e+00j
0.00000000e+00+0.00000000e+00j 0.00000000e+00+0.00000000e+00j
0.00000000e+00+0.00000000e+00j 0.00000000e+00+0.00000000e+00j
0.00000000e+00+0.00000000e+00j 0.00000000e+00+0.00000000e+00j]