数据预处理程序参考

import pandas as pd
# from sklearn.model_selection import train_test_split
# Load the raw data set.
df = pd.read_csv("F:/Advertising.csv")

# Fill in missing values with the per-column mean.
# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
# `SimpleImputer` is its replacement and always works column-wise (the old
# `axis=0` behaviour). The missing-value marker must be `np.nan` -- the
# string 'NAN' is not a recognised placeholder.
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit and transform on the same ndarray view so that scikit-learn does not
# warn about feature names being present at fit time but absent later.
imr = imr.fit(df.values)
# Re-wrap the imputed ndarray in a DataFrame: later steps of this script use
# `data.iloc` and `data.columns`, which a bare ndarray does not provide.
# NOTE(review): assumes every column is numeric and has at least one observed
# value; otherwise SimpleImputer raises or drops the column -- confirm.
data = pd.DataFrame(imr.transform(df.values), columns=df.columns, index=df.index)
# Map the ordinal 'size' feature onto integers that preserve its ordering
# (XL > L > M). Values missing from the mapping become NaN.
size_mapping = dict(zip(('XL', 'L', 'M'), (3, 2, 1)))
size_codes = df['size'].map(size_mapping)
df['size'] = size_codes
# Encode the class labels as integers.
import numpy as np

# np.unique returns the distinct labels in sorted order; each label is
# assigned its position in that sorted sequence.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels

# Invert the key/value pairs of the mapping so that the integer codes can be
# turned back into the original string labels.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
# Do the same class-label encoding with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder

raw_labels = df['classlabel'].values
class_le = LabelEncoder()
y = class_le.fit_transform(raw_labels)

# inverse_transform maps the integer codes back to the original string
# labels. The result is not stored here (it is only displayed when the
# statement is run interactively).
class_le.inverse_transform(y)
# One-hot encoding of nominal features -- preparation step.
# Only the integer label-encoding of the 'color' column is performed here;
# no one-hot expansion is applied in this section.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame(
    [['green', 10, 'class1'], ['red', 13, 'class2'], ['blue', 15, 'class1']],
    columns=['color', 'price', 'classlabel'],
)

# Feature matrix (object dtype) holding the color and price columns.
x = df[['color', 'price']].values
print(x)

# Replace the color strings in column 0 with their integer codes.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(x[:, 0])
x[:, 0] = color_codes

print(x)
 

from sklearn.preprocessing import OneHotEncoder

# Rank feature importance with a random forest.
from sklearn.ensemble import RandomForestClassifier
# train_test_split is imported here because the file-level import of it is
# commented out, which left the name undefined.
from sklearn.model_selection import train_test_split

# `data` is produced by the imputation step. If it is still a bare ndarray,
# wrap it so the `.iloc` / `.columns` accessors below are available (column
# labels are then the integer positions).
frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

# Column 0 is used as the target; all remaining columns are the features.
x, y = frame.iloc[:, 1:].values, frame.iloc[:, 0].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

labels = frame.columns[1:]
# 10,000 trees: the previous value of 10,000,000,000 could never be trained
# within realistic time or memory limits.
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
# The target is cast to int before fitting.
forest.fit(x_train, y_train.astype(int))

importances = forest.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]
for rank, feat_idx in enumerate(indices, start=1):
    # Look the name up through the same sorted index as the importance so
    # that each printed name belongs to the value next to it.
    print("%2d) %-*s %f" % (rank, 30, labels[feat_idx], importances[feat_idx]))
# Visualize the ranked feature importances as a bar chart.
import matplotlib.pyplot as plt

n_features = x_train.shape[1]
plt.title("Features Importances")
plt.bar(range(n_features), importances[indices], color='lightblue', align='center')
# The bars are drawn in descending-importance order, so the tick labels must
# be reordered with the same index array to sit under the right bars.
plt.xticks(range(n_features), np.asarray(labels)[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()


猜你喜欢

转载自blog.csdn.net/qq_27584277/article/details/80351485
今日推荐