工作中遇到的问题3:seaborn画图,特征重要性,洗牌(重组),代码运行时间

1、画图

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# paraz_list holds the feature column names, train is the dataset and
# 'label' is the 0/1 target. One figure is drawn per feature so the
# distributions of the two classes can be compared.
for para in paraz_list:
    facet = sns.FacetGrid(train, hue="label", aspect=4)
    # `fill` is the current name of the deprecated `shade` keyword.
    facet.map(sns.kdeplot, para, fill=True)
    # Clip the x axis to the observed range of this feature.
    facet.set(xlim=(train[para].min(), train[para].max()))
    facet.add_legend()

2、相关性

sns.set()
# Plot the features together with the label. A new list is built instead of
# appending to paraz_list, so re-running this cell never adds 'label' twice
# and the feature list stays usable for the per-feature plots.
pair_cols = [c for c in paraz_list if c != 'label'] + ['label']
# `height` is the current name of the per-facet size argument (was `size`).
sns.pairplot(train[pair_cols], height=2.5)
plt.show()
# Spearman rank-correlation matrix of train, drawn as an annotated heatmap.
sns.heatmap(train.corr('spearman'), annot=True, cmap='RdYlGn', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

3、标准化,模型训练,输出特征重要性

from scipy.stats import probplot
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Standardize: fit the scaler on the training split only and reuse the same
# statistics for the test split.
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

# Fit a random forest and predict the held-out split.
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
pre = rfr.predict(X_test)

# Feature importances, highest first.
# NOTE(review): X_all is presumably the feature frame that X_train/X_test
# were split from; confirm its column order matches the fitted matrix.
imp = pd.DataFrame({'feature': X_all.columns,
                    'score': rfr.feature_importances_})
print(imp.sort_values(by='score', ascending=False))

# Probability plot of the residuals to inspect the fit on the held-out split.
f = plt.figure(figsize=(8, 6))
ax = f.add_subplot(111)
probplot(y_test - pre, plot=ax)
plt.show()

4、洗牌,重新组合

from random import shuffle

# Column-wise shuffle: every column is permuted independently, which breaks
# the row alignment between columns. Ten reshuffled copies are stacked into X.
n_rounds = 10  # number of reshuffled copies to stack

# Work on a copy so the original df_train is left intact.
shuffled = df_train.copy()
rounds = []
for _ in range(n_rounds):
    for col in shuffled.columns:
        values = list(shuffled[col])
        shuffle(values)
        shuffled[col] = values
    # Snapshot this round; the next round keeps shuffling the working copy.
    rounds.append(shuffled.copy())

# Concatenate once at the end instead of growing X inside the loop.
X = pd.concat(rounds, axis=0)

5、随机生成样本进行预测,并保存

import csv
import os
import random

n = 0  # number of samples that met the condition and were written
m = 0  # number of random samples generated in total

# Start from a clean output file.
if os.path.isfile('data.csv'):
    os.remove('data.csv')

fieldnames = ['para1', 'para2', 'para3', 'label']
feature_names = fieldnames[:-1]
# The sampling range of each feature does not change, so compute it once.
bounds = [(test[name].min(), test[name].max()) for name in feature_names]

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # NOTE: this loop only ends once 100 samples have been accepted; it will
    # not terminate if clf never predicts a value inside the target interval.
    while n < 100:
        # One uniformly drawn float per feature, in feature_names order.
        data = [random.uniform(low, high) for low, high in bounds]
        ypred = clf.predict([data])
        m = m + 1
        if 0.18 <= ypred[0] < 0.2:  # acceptance condition
            # Pair each value with its own column (data is 0-indexed).
            row = dict(zip(feature_names, data))
            row['label'] = ypred[0]
            writer.writerow(row)
            n = n + 1
    # The with-statement closes the file; no explicit close() is needed.
print('生成了%d个随机数'%m)
print('%d满足条件的'%n)

6、记录代码运行时间

import datetime

# Wall-clock timing: take a timestamp before and after the code of interest.
starttime = datetime.datetime.now()
# ... place the code to be timed here ...
endtime = datetime.datetime.now()
# The difference is a datetime.timedelta and prints as H:MM:SS.ffffff.
print('use time:', endtime - starttime)

猜你喜欢

转载自blog.csdn.net/sisteryaya/article/details/79567273