task1 day2 Feature Engineering

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

loan=pd.read_csv('C:\\Users\\85382\\Desktop\\shixi\\loan.csv')

loan.head()
loan.shape

# Inspect the target variable: counts of defaulters vs. non-defaulters
loan['status'].value_counts()
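
# A quick sketch: value_counts(normalize=True) gives the same split as
# proportions, which makes the degree of class imbalance explicit
loan['status'].value_counts(normalize=True)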

# Check column dtypes: float, integer, or string (object)
loan.dtypes
# Most variables are floats or integers, but bank_card_no, reg_preference_for_trad, source, id_me, latest_query_time and loans_latest_time are strings
# Note: string columns cannot be fed into feature selection
# Apart from the categorical variable reg_preference_for_trad, the string variables carry little information

loan_new=loan.dropna() # drop rows that contain missing values
total_rate=loan_new.shape[0]*100/loan.shape[0] # percentage of fully complete rows
total_rate # only 32.27% of the rows in loan are complete

# Missing-value ratio (%) of each variable in loan
lack_loan = 100 * loan.isnull().sum() / len(loan)
# Let pandas display up to 100 rows and 100 columns of output
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
lack_loan # student_feature is missing in as much as 63% of the rows
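
# A small readability sketch: keep only the columns that actually have missing
# values and sort them, so high-missing columns like student_feature stand out
lack_loan[lack_loan > 0].sort_values(ascending=False)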

# Dummy-encode the categorical variable reg_preference_for_trad
city=pd.get_dummies(loan['reg_preference_for_trad'])
loan['first_city']=city['一线城市']
loan['second_city']=city['二线城市']
loan['third_city']=city['三线城市']
loan['other_city']=city['其他城市']
loan['oversea']=city['境外']
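# An equivalent, more compact sketch: get_dummies can name the dummy columns
# itself via a prefix, and join() attaches them all in one step
city_dummies = pd.get_dummies(loan['reg_preference_for_trad'], prefix='city')
# loan = loan.join(city_dummies) # would add city_一线城市, city_二线城市, ... in one go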
# Initial variable screening
# Of the 90 variables, student_feature is dropped outright because its missing rate is too high; low-information variables such as the card number and user id are removed as well, along with string time columns such as latest_query_time
# After this initial screening, 84 of the 90 variables remain (including the dummy variables)
loan_new=loan.drop(['Unnamed: 0','custid','trade_no','student_feature','latest_query_time','bank_card_no','first_transaction_time','loans_latest_time','source','id_me','reg_preference_for_trad'], axis=1)
loan_new.head()
loan_new.shape

# Check the completeness of the loan_new dataset
loan_new_unlack=loan_new.dropna()
total_rate=loan_new_unlack.shape[0]*100/loan_new.shape[0] # percentage of complete rows
total_rate # 83.8% of the rows are complete; the missing rate is low enough that we simply drop the incomplete rows
# Extract the features
X=loan_new_unlack.drop('status',axis=1)
# Extract the target variable status
y=loan_new_unlack['status']
# At this point neither X nor y contains missing values

# Feature selection (if this step fails, it is because string columns are still present; keeping only the numeric columns fixes it)
X.shape # 83 features in total
# Variance-threshold method
from sklearn.feature_selection import VarianceThreshold

# Variance-threshold selection: returns the data with low-variance features removed
# The threshold parameter is the minimum variance a feature needs in order to be kept
selector = VarianceThreshold(threshold=3)
X_var = selector.fit_transform(X)
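
# fit_transform returns a bare ndarray and drops the column names; a sketch to
# see which features survived the threshold, via the selector's support mask
X.columns[selector.get_support()]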

# L1-based selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new_1 = model.transform(X)
X_new_1.shape # reduced to 38 features (fairly stable around 38)
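# A sketch recovering which features the L1-penalized SVM kept, again via the
# selector's boolean support mask
l1_feats = X.columns[model.get_support()]
l1_feats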
# Tree-based selection
clf = ExtraTreesClassifier().fit(X, y)
clf.feature_importances_
model = SelectFromModel(clf, prefit=True)
X_new_2 = model.transform(X)
X_new_2.shape # reduced to about 21 features (fluctuates quite a bit, roughly 20-25)
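# A rough consistency sketch: intersecting the two selections shows how many
# features both methods agree on (the tree importances are random, so this
# count varies from run to run)
tree_feats = X.columns[model.get_support()]
len(set(l1_feats) & set(tree_feats))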
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
# Standardize the data: fit the scaler on the training set only, so no test-set statistics leak into it
scaler = preprocessing.StandardScaler().fit(X_train)
# standardize the training set
X_train=scaler.transform(X_train)
# standardize the test set
X_test=scaler.transform(X_test)
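# An alternative sketch: putting the scaler inside a Pipeline guarantees it is
# fit only on the training fold, so the leakage concern above is handled
# automatically (including under cross-validation); it would be fit on the
# raw, unscaled training split
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(preprocessing.StandardScaler(), LinearSVC(dual=False))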

from sklearn.ensemble import RandomForestClassifier
feat_labels=X.columns
forest=RandomForestClassifier(n_estimators=10000,n_jobs=-1,random_state=0)
forest.fit(X_train,y_train)
importances=forest.feature_importances_
indices=np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    # Rank features by mean impurity decrease, averaged over the 10000 trees
    print("%2d) %-*s %f" % (f+1, 30, feat_labels[indices[f]], importances[indices[f]]))
# Visualize feature importance (mean impurity decrease)
plt.title('Feature Importance-RandomForest')
plt.bar(range(X_train.shape[1]),importances[indices],color='lightblue',align='center')
plt.xticks(range(X_train.shape[1]),feat_labels[indices],rotation=90)
plt.xlim([-1,X_train.shape[1]])
plt.tight_layout()
plt.show()
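
# A closing sketch: the fitted forest can drive SelectFromModel as well,
# keeping only features whose importance exceeds the mean (the default threshold)
sfm = SelectFromModel(forest, prefit=True)
X_train_top = sfm.transform(X_train)
X_train_top.shape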

Reposted from blog.csdn.net/weixin_43891494/article/details/88130605