二手车预测task4打卡

通过听小雨姑娘的讲解，深受启发，下面是我对本次打卡的一些探索和总结，话不多说直接上代码：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        bounds = limits(candidate)
        # Inclusive bounds: a value exactly equal to a dtype limit still fits.
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Integer columns (signed or unsigned numpy dtypes) are cast to the
      smallest of int8 / int16 / int32 / int64 that holds the column's
      min and max; if none fits, the column is left unchanged.
    * Float columns are cast to float16, then float32, falling back to
      float64. NOTE: float16 keeps only about three significant decimal
      digits, so this downcast is lossy.
    * ``object`` columns are converted to ``category``.
    * Columns of any other dtype (bool, datetime, categorical, pandas
      extension dtypes, ...) are left untouched.

    The memory usage before and after, in MB, and the relative saving are
    printed.

    Args:
        df: DataFrame to optimise; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the printed unit is really MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy integer / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind in 'iu':
            target = _smallest_fitting_dtype(c_min, c_max, int_candidates, np.iinfo)
        else:
            # NaN / inf bounds fail every comparison and end up as float64.
            target = _smallest_fitting_dtype(c_min, c_max, float_candidates, np.finfo) or np.float64
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

# Read the feature table from disk and downcast its columns to save memory.
raw_features = pd.read_csv('data_for_tree.csv')
sample_feature = reduce_mem_usage(raw_features)

# Model inputs: every column except the ones listed here
# ('brand' is listed twice; the repeat has no effect).
excluded_columns = ['price', 'brand', 'model', 'brand']
continuous_feature_names = [
    column for column in sample_feature.columns if column not in excluded_columns
]

Memory usage of dataframe is 61998896.00 MB
Memory usage after optimization is: 16493494.00 MB
Decreased by 73.4%

# Drop rows with missing values, replace the '-' placeholder with 0 and
# renumber the remaining rows from 0.
sample_feature = sample_feature.dropna()
sample_feature = sample_feature.replace('-', 0)
sample_feature = sample_feature.reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Training frame: the model inputs plus the target column.
model_columns = continuous_feature_names + ['price']
train = sample_feature[model_columns]

train_y = train['price']
train_X = train[continuous_feature_names]

from sklearn.linear_model import LinearRegression

# Ordinary least squares on the raw price.
# The former `normalize=True` argument is omitted: it was removed in
# scikit-learn 1.2, and for an unregularised LinearRegression it only
# rescaled the inputs internally without changing the fitted model.
model = LinearRegression()
model = model.fit(train_X, train_y)

print('intercept:'+ str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x:x[1], reverse=True)
# Intercept and weights (coef) of the linear regression model

intercept:-110670.68277246761





[('v_6', 3367064.341641827),
 ('v_8', 700675.5609398744),
 ('v_9', 170630.27723220555),
 ('v_7', 32322.661932023566),
 ('v_12', 20473.670796983995),
 ('v_3', 17868.079541508385),
 ('v_11', 11474.938996718529),
 ('v_13', 11261.764560018768),
 ('v_10', 2683.920090597511),
 ('gearbox', 881.8225039247808),
 ('fuelType', 363.90425072163765),
 ('bodyType', 189.60271012070908),
 ('city', 44.949751205249136),
 ('power', 28.5539016167488),
 ('brand_price_median', 0.5103728134078717),
 ('brand_price_std', 0.4503634709263301),
 ('brand_amount', 0.1488112039506551),
 ('brand_price_max', 0.0031910186703129327),
 ('SaleID', 5.355989919855894e-05),
 ('train', 2.4586915969848633e-07),
 ('offerType', -1.651933416724205e-06),
 ('seller', -4.1157472878694534e-06),
 ('brand_price_sum', -2.175006868187571e-05),
 ('name', -0.00029800127131154245),
 ('used_time', -0.00251589433286487),
 ('brand_price_average', -0.4049048451011336),
 ('brand_price_min', -2.2467753486887223),
 ('power_bin', -34.420644117251825),
 ('v_14', -274.78411807779867),
 ('kilometer', -372.8975266606955),
 ('notRepairedDamage', -495.1903844627379),
 ('v_0', -2045.0549573556823),
 ('v_5', -11022.986240550124),
 ('v_4', -15121.731109859189),
 ('v_2', -26098.29992055111),
 ('v_1', -45556.189297274395)]

# Draw 50 random row positions (np.random.randint may repeat a position)
# and compare the true price with the model's prediction along v_9.
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)
sampled_v9 = train_X['v_9'][subsample_index]
sampled_price = train_y[subsample_index]
sampled_prediction = model.predict(train_X.loc[subsample_index])

plt.scatter(sampled_v9, sampled_price, color='black')
plt.scatter(sampled_v9, sampled_prediction, color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
plt.show()
# The plot shows that the predictions (blue) are distributed quite differently from the true labels (black)

在这里插入图片描述

# Left: distribution of the full label. Right: only the values below the
# 90% quantile.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_y)
plt.subplot(1, 2, 2)
price_q90 = np.quantile(train_y, 0.9)
sns.distplot(train_y[train_y < price_q90])
# Inspecting the real distribution: the label (price) is long-tailed and does not follow a normal distribution

<matplotlib.axes._subplots.AxesSubplot at 0x1e64514fa90>

在这里插入图片描述

# Apply a log(x + 1) transform to the label so that it is closer to a
# normal distribution, then redraw the two distribution plots.
train_y_ln = np.log(train_y + 1)
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_y_ln)
plt.subplot(1, 2, 2)
log_price_q90 = np.quantile(train_y_ln, 0.9)
sns.distplot(train_y_ln[train_y_ln < log_price_q90])

<matplotlib.axes._subplots.AxesSubplot at 0x1e64590ccf8>

在这里插入图片描述

# Refit on the log-transformed label and inspect the intercept and weights (coef)
model = model.fit(train_X, train_y_ln)

print('intercept:'+ str(model.intercept_))
coef_by_feature = dict(zip(continuous_feature_names, model.coef_))
sorted(coef_by_feature.items(), key=lambda item: item[1], reverse=True)

intercept:18.750745460080392





[('v_9', 8.052411927761039),
 ('v_5', 5.764248502276934),
 ('v_12', 1.6182066744718018),
 ('v_1', 1.4798302934385128),
 ('v_11', 1.1669014496974728),
 ('v_13', 0.9404706038647674),
 ('v_7', 0.7137295307904377),
 ('v_3', 0.6837865320343457),
 ('v_0', 0.008500525238639573),
 ('power_bin', 0.008497967226208911),
 ('gearbox', 0.007922377819953778),
 ('fuelType', 0.006684768278649912),
 ('bodyType', 0.004523520659141157),
 ('power', 0.0007161896117539691),
 ('brand_price_min', 3.334353082747484e-05),
 ('brand_amount', 2.8978800102546807e-06),
 ('brand_price_median', 1.2571119996608522e-06),
 ('brand_price_std', 6.659134278527834e-07),
 ('brand_price_max', 6.194957240893533e-07),
 ('brand_price_average', 5.999429489201407e-07),
 ('SaleID', 2.1194162066547424e-08),
 ('train', -2.9558577807620168e-12),
 ('offerType', -4.3874237576346786e-11),
 ('seller', -1.3236878260158846e-10),
 ('brand_price_sum', -1.5126510445824183e-10),
 ('name', -7.015510649909473e-08),
 ('used_time', -4.122477171058659e-06),
 ('city', -0.0022187835425504236),
 ('v_14', -0.004234186905404002),
 ('kilometer', -0.013835866887579094),
 ('notRepairedDamage', -0.2702794206248401),
 ('v_4', -0.8315696877542701),
 ('v_2', -0.9470831015181023),
 ('v_10', -1.626147367313265),
 ('v_8', -40.34300698769784),
 ('v_6', -238.79035828045355)]

# The model was fitted on log(price + 1), so its predictions are mapped back
# to the price scale with the exact inverse, exp(pred) - 1 (np.expm1);
# a plain np.exp would leave every predicted price one unit too high.
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_X['v_9'][subsample_index], np.expm1(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
plt.show()
# Visualising again: the predictions are now fairly close to the true values

在这里插入图片描述

#五折交叉验证
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer
def log_transfer(func):
    """Wrap a metric so that it is evaluated on the natural log of its inputs.

    The returned callable computes ``func(log(y), log(yhat))``. Note that it
    uses ``log(x)``, not the ``log(x + 1)`` used to build ``train_y_ln``.

    Args:
        func: metric taking ``(y_true, y_pred)``.

    Returns:
        A function with the same ``(y, yhat)`` call signature that carries
        ``func``'s name and docstring.
    """
    from functools import wraps

    @wraps(func)  # keep the wrapped metric's name/docstring on the wrapper
    def wrapper(y, yhat):
        # The log of a negative prediction is NaN; np.nan_to_num replaces NaN
        # with 0 (and +/-inf with the largest finite floats) in the
        # predictions only -- ``y`` is passed through np.log unchanged.
        result = func(np.log(y), np.nan_to_num(np.log(yhat)))
        return result
    return wrapper
# 5-fold CV against the raw price; the MAE is taken between log(y) and
# log(prediction) through log_transfer.
log_mae_scorer = make_scorer(log_transfer(mean_absolute_error))
scores = cross_val_score(model, X=train_X, y=train_y, verbose=1, cv=5, scoring=log_mae_scorer)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished

# Mean 5-fold CV score for the model trained on the untransformed label
scores.mean()

1.36580240424085

# 5-fold CV against the log-transformed label, scored with plain MAE.
mae_scorer = make_scorer(mean_absolute_error)
scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=1, cv=5, scoring=mae_scorer)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished

# Mean 5-fold CV score for the model trained on the log-transformed label
scores.mean()

0.1932530153517687

# Present the five per-fold scores as a one-row table labelled 'MAE'.
fold_labels = ['cv' + str(fold) for fold in range(1, 6)]
scores = pd.DataFrame(scores.reshape(1, -1), index=['MAE'], columns=fold_labels)
scores

	cv1	cv2	cv3	cv4	cv5
MAE	0.190792	0.193758	0.194132	0.191825	0.195758

#模拟真实业务情况
import datetime
# Hold-out split in row order: first 4/5 for training, last 1/5 for validation.
sample_feature = sample_feature.reset_index(drop=True)
split_point = len(sample_feature) // 5 * 4
# Slice by position with .iloc: label-based .loc slices include BOTH end
# points, which would put the row at split_point into train and val alike.
train = sample_feature.iloc[:split_point].dropna()
val = sample_feature.iloc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)
model = model.fit(train_X, train_y_ln)
# MAE on the held-out part, measured on the log(price + 1) scale.
mean_absolute_error(val_y_ln, model.predict(val_X))

0.19577667040507432

#绘制学习曲线与验证曲线
from sklearn.model_selection import learning_curve, validation_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=1, train_size=np.linspace(.1, 1.0, 5 )):
    """Plot training and cross-validation MAE against the training-set size.

    For each size returned by ``learning_curve`` the mean score over the CV
    folds is drawn as a line, with a shaded band of one standard deviation
    on either side.

    Args:
        estimator: estimator passed to ``learning_curve``.
        title: figure title.
        X, y: features and labels passed to ``learning_curve``.
        ylim: optional ``(low, high)`` limits for the y axis.
        cv: cross-validation setting forwarded to ``learning_curve``.
        n_jobs: parallelism setting forwarded to ``learning_curve``.
        train_size: training-set sizes forwarded as ``train_sizes``.

    Returns:
        The ``matplotlib.pyplot`` module, with the figure drawn.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error),
    )
    # (mean over folds, std over folds, colour, legend label) per curve
    curves = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), 'r', "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), 'g', "Cross-validation score"),
    )

    plt.grid()
    # Shaded +/- one-std bands first, then the mean lines on top.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt

# Learning curve of a plain linear regression on the first 1000 training rows.
plot_learning_curve(
    LinearRegression(), 'Liner_model',
    train_X[:1000], train_y_ln[:1000],
    ylim=(0.0, 0.5), cv=5, n_jobs=1,
)

<module 'matplotlib.pyplot' from 'H:\\aanaconda3\\lib\\site-packages\\matplotlib\\pyplot.py'>

在这里插入图片描述

# Comparison of several models: rebuild the training data from the full
# feature table, dropping rows with missing values.
model_columns = continuous_feature_names + ['price']
train = sample_feature[model_columns].dropna()
train_X, train_y = train[continuous_feature_names], train['price']
train_y_ln = np.log(train_y + 1)

#线性模型 & 嵌入式特征选择
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# 5-fold CV MAE (on the log label) for each linear model, keyed by the text
# before the first '(' in the estimator's string form.
models = [LinearRegression(), Ridge(), Lasso()]
result = dict()
for model in models:
    model_name = str(model).partition('(')[0]
    result[model_name] = cross_val_score(
        model, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    scores = result[model_name]
    print(model_name + ' is finished')

LinearRegression is finished
Ridge is finished
Lasso is finished

# Side-by-side comparison of the three methods, one row per CV fold
fold_labels = ['cv' + str(fold) for fold in range(1, 6)]
result = pd.DataFrame(result)
result.index = fold_labels
result

	LinearRegression	Ridge	Lasso
cv1	0.190792	0.194832	0.383899
cv2	0.193758	0.197632	0.381893
cv3	0.194132	0.198123	0.384090
cv4	0.191825	0.195670	0.380526
cv5	0.195758	0.199676	0.383611

# Plain linear regression: absolute weight of every feature.
model = LinearRegression().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# x / y are passed by keyword: current seaborn no longer accepts them
# positionally, while the keyword form works on every version.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:18.750745460114032





<matplotlib.axes._subplots.AxesSubplot at 0x1e6445255c0>

在这里插入图片描述

# Ridge (L2-regularised) regression: absolute weight of every feature.
model = Ridge().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# x / y are passed by keyword: current seaborn no longer accepts them
# positionally, while the keyword form works on every version.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:4.671710857050353





<matplotlib.axes._subplots.AxesSubplot at 0x1e6440e4d30>

在这里插入图片描述

# Lasso (L1-regularised) regression: absolute weight of every feature.
model = Lasso().fit(train_X, train_y_ln)
print('intercept:'+ str(model.intercept_))
# x / y are passed by keyword: current seaborn no longer accepts them
# positionally, while the keyword form works on every version.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)

intercept:8.672182455497687





<matplotlib.axes._subplots.AxesSubplot at 0x1e644255400>

在这里插入图片描述

#非线性模型
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
# 5-fold CV MAE (on the log label) for the linear baseline and several
# non-linear regressors.
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    MLPRegressor(solver='lbfgs', max_iter=100),
    XGBRegressor(n_estimators = 100, objective='reg:squarederror'),
    LGBMRegressor(n_estimators = 100),
]
result = dict()
for model in models:
    model_name = str(model).partition('(')[0]
    result[model_name] = cross_val_score(
        model, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    scores = result[model_name]
    print(model_name + ' is finished')

# One column per model, one row per CV fold.
fold_labels = ['cv' + str(fold) for fold in range(1, 6)]
result = pd.DataFrame(result)
result.index = fold_labels
result

# Model tuning, greedy: tune one hyper-parameter at a time and keep the best
# value found so far fixed while tuning the next one.
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']

num_leaves = [3,5,10,15,20,40, 55]
max_depth = [3,5,10,15,20,40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []

# Step 1: objective function.
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_obj[obj] = score
chosen_objective = min(best_obj, key=best_obj.get)

# Step 2: number of leaves, with the best objective fixed.
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=chosen_objective, num_leaves=leaves)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_leaves[leaves] = score
chosen_leaves = min(best_leaves, key=best_leaves.get)

# Step 3: maximum depth, with the best objective and leaf count fixed.
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=chosen_objective,
                          num_leaves=chosen_leaves,
                          max_depth=depth)
    fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
    score = np.mean(fold_mae)
    best_depth[depth] = score

# Best MAE after each step; the first point (0.143) is a hard-coded starting value.
stage_labels = ['0_initial','1_turning_obj','2_turning_leaves','3_turning_depth']
stage_mae = [0.143 ,min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())]
sns.lineplot(x=stage_labels, y=stage_mae)

#调参
from sklearn.model_selection import GridSearchCV
# Exhaustive grid search over the same three hyper-parameters.
parameters = {'objective': objective , 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
# Search on the log-transformed label and rank candidates by MAE, so the
# selection uses the same target and metric as every other evaluation here
# (fitting on the raw price with the default R^2 score would not).
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_y_ln)
clf.best_params_
{'max_depth': 15, 'num_leaves': 55, 'objective': 'regression'}
# 5-fold CV MAE (on the log label) of LightGBM with fixed hyper-parameters.
model = LGBMRegressor(objective='regression', num_leaves=55, max_depth=15)
fold_mae = cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error))
np.mean(fold_mae)

from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for the optimiser: ``1 - mean 5-fold CV MAE`` of a LightGBM model.

    ``num_leaves``, ``max_depth`` and ``min_child_samples`` are truncated to
    integers with ``int()``; ``subsample`` is passed through unchanged. The
    model uses the ``regression_l1`` objective and is scored on the
    module-level ``train_X`` / ``train_y_ln``.

    Returns:
        ``1 - MAE``, so that a larger value means a smaller error.
    """
    estimator = LGBMRegressor(
        objective='regression_l1',
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        subsample=subsample,
        min_child_samples=int(min_child_samples),
    )
    fold_mae = cross_val_score(
        estimator, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    return 1 - fold_mae.mean()
# Bayesian optimisation of rf_cv over the (low, high) range of each parameter.
search_space = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples' : (2, 100),
}
rf_bo = BayesianOptimization(rf_cv, search_space)
rf_bo.maximize()
# rf_cv returns 1 - MAE, so convert the best target back to an MAE.
best_target = rf_bo.max['target']
print(1 - best_target)

# Hard-coded MAE recorded after each improvement step of this walkthrough.
stage_labels = ['0_origin','1_log_transfer','2_L1_&_L2','3_change_model','4_parameter_turning']
stage_mae = [1.36 ,0.19, 0.19, 0.14, 0.13]
plt.figure(figsize=(13, 5))
sns.lineplot(x=stage_labels, y=stage_mae)
# Basic ways of improving the prediction accuracy

qq_40791906

发布了3 篇原创文章 · 获赞 1 · 访问量 100

私信关注

二手车预测task4打卡

猜你喜欢