Annual Salary Prediction (>50K)

1. Download and format the data

Download the Adult dataset from the official site, set the data paths, and attach column labels.

import numpy as np
import pandas as pd

train_path = "./data/adult.data"
test_path = "./data/adult.test"
train_set = pd.read_csv(train_path, header=None)
# The first line of adult.test is not data, so skip it
test_set = pd.read_csv(test_path, skiprows=1, header=None)
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
              'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
              'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

2. Replace missing data

# Missing values are recorded as ' ?' (note the leading space);
# check how many rows remain after dropping them
print(train_set.replace(' ?', np.nan).dropna().shape)
print(test_set.replace(' ?', np.nan).dropna().shape)

train_nomissing = train_set.replace(' ?', np.nan).dropna()
test_nomissing = test_set.replace(' ?', np.nan).dropna()
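
The tuning code below uses final_train, y_train, final_test, and y_test, but the post never shows how they are built. A minimal sketch of one plausible preprocessing step, assuming integer encoding of the categorical columns (the feature-importance output later lists the original column names, which suggests label encoding rather than one-hot encoding):

# Hypothetical preprocessing -- not shown in the original post.
# Labels in adult.test carry a trailing period (' >50K.'), so normalize both sets
train_nomissing['wage_class'] = train_nomissing['wage_class'].str.rstrip('.')
test_nomissing['wage_class'] = test_nomissing['wage_class'].str.rstrip('.')

# Encode categorical columns consistently across train and test
combined = pd.concat([train_nomissing, test_nomissing], keys=['train', 'test'])
for col in combined.select_dtypes(include='object').columns:
    combined[col] = combined[col].astype('category').cat.codes

train_enc = combined.loc['train'].copy()
test_enc = combined.loc['test'].copy()
y_train = train_enc.pop('wage_class')   # 0 = <=50K, 1 = >50K
y_test = test_enc.pop('wage_class')
final_train, final_test = train_enc, test_enc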

3. Parameter optimization

3.1 Optimizing max_depth and min_child_weight

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0, 'subsample': 0.8,
              'colsample_bytree': 0.8, 'objective': 'binary:logistic'}
# Optimize for accuracy, since that is the metric used to evaluate this task
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                             cv_params,
                             scoring='accuracy', cv=5, n_jobs=-1)
optimized_GBM.fit(final_train, y_train)

Output

print (optimized_GBM.best_params_)
print (optimized_GBM.cv_results_['mean_test_score'])
{'max_depth': 3, 'min_child_weight': 5}
[0.86648763 0.86668656 0.86675287 0.86075194 0.86144818 0.86164711
0.85485047 0.85465155 0.85683973]
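
The nine scores line up with the 3 x 3 grid of (max_depth, min_child_weight) combinations in the order GridSearchCV enumerates them, so the third score (0.86675) belongs to the winning pair. To see the pairing explicitly (standard GridSearchCV API, not in the original post):

# Tabulate each parameter combination next to its mean CV accuracy
results = pd.DataFrame(optimized_GBM.cv_results_)
print(results[['param_max_depth', 'param_min_child_weight', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False))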

3.2 Optimizing learning_rate and subsample

cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]}
# Fix max_depth and min_child_weight at the values found in 3.1
ind_params = {'n_estimators': 1000, 'seed': 0, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 5}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                             cv_params,
                             scoring='accuracy', cv=5, n_jobs=-1)
optimized_GBM.fit(final_train, y_train)

Output

print (optimized_GBM.best_params_)
print (optimized_GBM.cv_results_['mean_test_score'])
{'learning_rate': 0.1, 'subsample': 0.8}
[0.86559247 0.86675287 0.86595716 0.86018832 0.86038724 0.85992308]

4. Optimal num_boost_round

xgdmat = xgb.DMatrix(final_train, y_train) # Create our DMatrix to make XGBoost more efficient
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'objective': 'binary:logistic', 'max_depth':3, 'min_child_weight':5}
# Grid Search CV optimized settings
cv_xgb = xgb.cv(params = our_params, dtrain = xgdmat, num_boost_round = 3000, nfold = 5,
                metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
                early_stopping_rounds = 100) # Look for early stopping that minimizes error

Output

print(cv_xgb.tail(5))
     train-error-mean  train-error-std  test-error-mean  test-error-std
429          0.119190         0.001485         0.131556        0.004630
430          0.119090         0.001430         0.131457        0.004759
431          0.119123         0.001423         0.131457        0.004752
432          0.119049         0.001482         0.131291        0.004626
433          0.119065         0.001603         0.131059        0.004543
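
With early_stopping_rounds=100, xgb.cv stops adding rows once the test error has not improved for 100 rounds, so the length of the returned DataFrame tells you where to cap num_boost_round; the last row here is round 433, which is the value used below. A quick way to read this off (standard xgboost/pandas calls, not in the original post):

# One row per boosting round; early stopping truncates the table
print(len(cv_xgb))                      # 434 rows -> last round index 433
print(cv_xgb['test-error-mean'].min())  # best cross-validated test error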

Reference settings

our_params = {'eta': 0.1, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 5}
# Train the final model for the number of rounds found by xgb.cv above
final_gb = xgb.train(our_params, xgdmat, num_boost_round=433)

Data visualization

# %matplotlib inline  (enable inline plots when running in a Jupyter notebook)
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(font_scale=1.5)
xgb.plot_importance(final_gb)  # plot feature importance
# plt.show()  # uncomment when running as a plain script

importances = final_gb.get_fscore()  # raw split counts per feature

importance_frame = pd.DataFrame({'Importance': list(importances.values()),
                                 'Feature': list(importances.keys())})
importance_frame.sort_values(by='Importance', inplace=True)
importance_frame.plot(kind='barh', x='Feature', figsize=(8, 8), color='orange')
# plt.show()

Importance results

print(importances)
{'relationship': 146, 'education_num': 158, 'capital_gain': 329, 'hours_per_week': 237,
'education': 76, 'age': 325, 'capital_loss': 234, 'marital_status': 105, 'fnlwgt': 488,
'occupation': 234, 'sex': 42, 'workclass': 103, 'race': 53, 'native_country': 94}

Experimental results

from sklearn.metrics import accuracy_score

testdmat = xgb.DMatrix(final_test)
y_pred = final_gb.predict(testdmat)  # predicted probabilities for the test set
# Threshold the probabilities at 0.5 to get class labels
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0

Accuracy

# accuracy_score expects (y_true, y_pred); print accuracy and error rate
print(accuracy_score(y_test, y_pred), 1 - accuracy_score(y_test, y_pred))
# 0.8678618857901726 0.13213811420982735

Random forest comparison

Reference code

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=196)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(n_jobs=4)  # default hyperparameters, no tuning
rf.fit(final_train, y_train)
Y_pred = rf.predict(final_test)

Reference results

Accuracy

print(accuracy_score(y_test, Y_pred), 1 - accuracy_score(y_test, Y_pred))
# 0.8415670650730411 0.15843293492695887
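
classification_report is imported above but never called; it reports per-class precision and recall, which is informative here because the classes are imbalanced (roughly three quarters of incomes are <=50K). A short comparison of the two models (hypothetical addition, not in the original post):

# Per-class precision/recall/F1 for both models (0 = <=50K, 1 = >50K)
print("Random forest:\n", classification_report(y_test, Y_pred))
print("XGBoost:\n", classification_report(y_test, y_pred))

On this test set, the tuned XGBoost model (86.8% accuracy) clearly outperforms the untuned random forest (84.2%).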
Source: blog.csdn.net/klaus_x/article/details/103831775