听了小雨姑娘的讲解,我深受启发。下面是我对本次打卡的一些探索和总结,话不多说,直接上代码:
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
import warnings
warnings. filterwarnings( 'ignore' )
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Integer columns (signed or unsigned) are narrowed to the smallest signed
      integer type whose range contains the observed min/max.
    * Float columns are narrowed to ``float16``/``float32`` when the observed
      min/max fit in that type's range. Only the range is checked, so
      precision may be lost.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, existing categoricals, pandas
      extension dtypes) is left untouched, as are columns that are empty or
      contain only missing values.

    A column is never converted to a type that is not strictly smaller than
    the one it already has.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the "MB" label is accurate.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy integer/float columns can be range-checked safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to measure.
            continue

        if col_type.kind == 'f':
            candidates = [(t, np.finfo(t)) for t in narrow_floats]
        else:
            # Python ints compare exactly against every iinfo bound,
            # including for uint64 values beyond the int64 range.
            c_min, c_max = int(c_min), int(c_max)
            candidates = [(t, np.iinfo(t)) for t in signed_ints]

        for target, limits in candidates:
            if limits.min <= c_min and c_max <= limits.max:
                # Convert only when it actually saves space.
                if np.dtype(target).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df
# Load the feature table and shrink its dtypes straight away.
sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))

# Every column except the target ('price') and 'brand'/'model' is a model input.
continuous_feature_names = [
    column
    for column in sample_feature.columns
    if column not in {'price', 'brand', 'model'}
]
Memory usage of dataframe is 61998896.00 MB
Memory usage after optimization is: 16493494.00 MB
Decreased by 73.4%
# Drop incomplete rows, map the '-' placeholder to 0 and renumber the rows 0..n-1.
sample_feature = sample_feature.dropna()
sample_feature = sample_feature.replace('-', 0)
sample_feature = sample_feature.reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Feature matrix and raw-price target used by the linear models below.
train = sample_feature[[*continuous_feature_names, 'price']]
train_X, train_y = train[continuous_feature_names], train['price']
from sklearn. linear_model import LinearRegression
# `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. Ordinary least squares is unregularised and
# scikit-learn reported the coefficients on the original feature scale either
# way, so the plain estimator gives the same fit.
model = LinearRegression()
model = model.fit(train_X, train_y)
print('intercept:' + str(model.intercept_))
# Feature weights, largest first (shown as the cell output in a notebook).
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True)
intercept:-110670.68277246761
[('v_6', 3367064.341641827),
('v_8', 700675.5609398744),
('v_9', 170630.27723220555),
('v_7', 32322.661932023566),
('v_12', 20473.670796983995),
('v_3', 17868.079541508385),
('v_11', 11474.938996718529),
('v_13', 11261.764560018768),
('v_10', 2683.920090597511),
('gearbox', 881.8225039247808),
('fuelType', 363.90425072163765),
('bodyType', 189.60271012070908),
('city', 44.949751205249136),
('power', 28.5539016167488),
('brand_price_median', 0.5103728134078717),
('brand_price_std', 0.4503634709263301),
('brand_amount', 0.1488112039506551),
('brand_price_max', 0.0031910186703129327),
('SaleID', 5.355989919855894e-05),
('train', 2.4586915969848633e-07),
('offerType', -1.651933416724205e-06),
('seller', -4.1157472878694534e-06),
('brand_price_sum', -2.175006868187571e-05),
('name', -0.00029800127131154245),
('used_time', -0.00251589433286487),
('brand_price_average', -0.4049048451011336),
('brand_price_min', -2.2467753486887223),
('power_bin', -34.420644117251825),
('v_14', -274.78411807779867),
('kilometer', -372.8975266606955),
('notRepairedDamage', -495.1903844627379),
('v_0', -2045.0549573556823),
('v_5', -11022.986240550124),
('v_4', -15121.731109859189),
('v_2', -26098.29992055111),
('v_1', -45556.189297274395)]
# Pick 50 random row labels (drawn with replacement) for a visual spot check.
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)

# Black: observed price. Blue: the model's prediction for the same rows.
for plotted_price, colour in (
    (train_y[subsample_index], 'black'),
    (model.predict(train_X.loc[subsample_index]), 'blue'),
):
    plt.scatter(train_X['v_9'][subsample_index], plotted_price, color=colour)

plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
plt.show()
# Left: distribution of the raw price. Right: the same with prices at or above
# the 90th percentile removed.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_y)
plt.subplot(1, 2, 2)
below_p90 = train_y < np.quantile(train_y, 0.9)
sns.distplot(train_y[below_p90])
<matplotlib.axes._subplots.AxesSubplot at 0x1e64514fa90>
# Log-transform the target as log(price + 1) and repeat the two distribution plots.
train_y_ln = np.log(train_y + 1)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.distplot(train_y_ln)
plt.subplot(1, 2, 2)
below_p90_ln = train_y_ln < np.quantile(train_y_ln, 0.9)
sns.distplot(train_y_ln[below_p90_ln])
<matplotlib.axes._subplots.AxesSubplot at 0x1e64590ccf8>
# Refit the same estimator on the log-transformed target.
model = model.fit(train_X, train_y_ln)
print(f'intercept:{model.intercept_}')
# Feature weights, largest first (shown as the cell output in a notebook).
sorted(
    dict(zip(continuous_feature_names, model.coef_)).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
intercept:18.750745460080392
[('v_9', 8.052411927761039),
('v_5', 5.764248502276934),
('v_12', 1.6182066744718018),
('v_1', 1.4798302934385128),
('v_11', 1.1669014496974728),
('v_13', 0.9404706038647674),
('v_7', 0.7137295307904377),
('v_3', 0.6837865320343457),
('v_0', 0.008500525238639573),
('power_bin', 0.008497967226208911),
('gearbox', 0.007922377819953778),
('fuelType', 0.006684768278649912),
('bodyType', 0.004523520659141157),
('power', 0.0007161896117539691),
('brand_price_min', 3.334353082747484e-05),
('brand_amount', 2.8978800102546807e-06),
('brand_price_median', 1.2571119996608522e-06),
('brand_price_std', 6.659134278527834e-07),
('brand_price_max', 6.194957240893533e-07),
('brand_price_average', 5.999429489201407e-07),
('SaleID', 2.1194162066547424e-08),
('train', -2.9558577807620168e-12),
('offerType', -4.3874237576346786e-11),
('seller', -1.3236878260158846e-10),
('brand_price_sum', -1.5126510445824183e-10),
('name', -7.015510649909473e-08),
('used_time', -4.122477171058659e-06),
('city', -0.0022187835425504236),
('v_14', -0.004234186905404002),
('kilometer', -0.013835866887579094),
('notRepairedDamage', -0.2702794206248401),
('v_4', -0.8315696877542701),
('v_2', -0.9470831015181023),
('v_10', -1.626147367313265),
('v_8', -40.34300698769784),
('v_6', -238.79035828045355)]
# Black: observed price for the sampled rows.
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')
# The model was fitted on log(price + 1), so predictions are mapped back to
# prices with expm1 (exp(x) - 1). A plain exp() would leave every prediction
# one unit too high.
plt.scatter(train_X['v_9'][subsample_index], np.expm1(model.predict(train_X.loc[subsample_index])), color='blue')
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
plt.show()
from sklearn. model_selection import cross_val_score
from sklearn. metrics import mean_absolute_error, make_scorer
def log_transfer(func):
    """Adapt a two-argument metric so it is evaluated on log-transformed inputs.

    Args:
        func: Metric called as ``func(y, yhat)``.

    Returns:
        A function ``wrapper(y, yhat)`` returning
        ``func(np.log(y), np.nan_to_num(np.log(yhat)))``. NaN/inf values
        produced by taking the log of ``yhat`` are replaced by
        ``np.nan_to_num`` before the metric sees them.
    """
    def wrapper(y, yhat):
        log_truth = np.log(y)
        log_prediction = np.nan_to_num(np.log(yhat))
        return func(log_truth, log_prediction)

    return wrapper
# 5-fold cross-validation against the raw price; log_transfer makes the scorer
# compute the MAE between log(y) and log(prediction).
scores = cross_val_score( model, X= train_X, y= train_y, verbose= 1 , cv = 5 , scoring= make_scorer( log_transfer( mean_absolute_error) ) )
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.1s finished
# Average of the five fold scores (shown as the cell output in a notebook).
np. mean( scores)
1.36580240424085
# 5-fold cross-validation against the log-transformed price, scored with plain MAE.
scores = cross_val_score( model, X= train_X, y= train_y_ln, verbose= 1 , cv = 5 , scoring= make_scorer( mean_absolute_error) )
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.1s finished
# Average of the five fold scores (shown as the cell output in a notebook).
np. mean( scores)
0.1932530153517687
# Present the five fold scores as a one-row table labelled 'MAE'.
scores = pd.DataFrame(
    scores.reshape(1, -1),
    index=['MAE'],
    columns=[f'cv{fold}' for fold in range(1, 6)],
)
scores
cv1
cv2
cv3
cv4
cv5
MAE
0.190792
0.193758
0.194132
0.191825
0.195758
import datetime
sample_feature = sample_feature.reset_index(drop=True)
# Rows before split_point (about 4/5 of the data) train the model; the rest validate it.
split_point = len(sample_feature) // 5 * 4

# Positional slicing keeps the two parts disjoint. Label-based .loc slices
# include both end labels, which would put row `split_point` in both the
# training and the validation set.
train = sample_feature.iloc[:split_point].dropna()
val = sample_feature.iloc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

model = model.fit(train_X, train_y_ln)
# Validation MAE on the log-price scale (shown as the cell output in a notebook).
mean_absolute_error(val_y_ln, model.predict(val_X))
0.19577667040507432
from sklearn. model_selection import learning_curve, validation_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_size=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation MAE against the training-set size.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        title: Figure title.
        X: Feature matrix.
        y: Target values.
        ylim: Optional ``(low, high)`` limits for the y axis.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Number of jobs forwarded to ``learning_curve``.
        train_size: Training-set sizes forwarded as ``train_sizes``.

    Returns:
        The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv, n_jobs=n_jobs, train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error),
    )

    # One entry per curve: (mean per size, std per size, colour, legend label).
    curves = (
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), 'r', "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), 'g', "Cross-validation score"),
    )

    plt.grid()
    # Shaded bands of +/- one standard deviation first, then the mean lines.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)
    for centre, _, colour, caption in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=caption)
    plt.legend(loc="best")
    return plt
# Learning curve of a fresh LinearRegression on the first 1000 training rows, 5-fold CV.
plot_learning_curve( LinearRegression( ) , 'Liner_model' , train_X[ : 1000 ] , train_y_ln[ : 1000 ] , ylim= ( 0.0 , 0.5 ) , cv= 5 , n_jobs= 1 )
<module 'matplotlib.pyplot' from 'H:\\aanaconda3\\lib\\site-packages\\matplotlib\\pyplot.py'>
# Rebuild the training data from the whole table: features, raw price and log(price + 1).
train = sample_feature[[*continuous_feature_names, 'price']].dropna()
train_X, train_y = train[continuous_feature_names], train['price']
train_y_ln = np.log(train_y + 1)
from sklearn. linear_model import Ridge
from sklearn. linear_model import Lasso
# Compare the three linear models by 5-fold MAE on the log-price target.
models = [LinearRegression(), Ridge(), Lasso()]
result = {}
for model in models:
    # Text before the first '(' of the estimator's string form, e.g. 'Ridge'.
    model_name = str(model).partition('(')[0]
    scores = cross_val_score(
        model, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    result[model_name] = scores
    print(model_name + ' is finished')
LinearRegression is finished
Ridge is finished
Lasso is finished
# One column per model, one row per fold.
result = pd.DataFrame(result)
result.index = [f'cv{fold}' for fold in range(1, 6)]
result
LinearRegression
Ridge
Lasso
cv1
0.190792
0.194832
0.383899
cv2
0.193758
0.197632
0.381893
cv3
0.194132
0.198123
0.384090
cv4
0.191825
0.195670
0.380526
cv5
0.195758
0.199676
0.383611
# Plain least squares: bar chart of the absolute coefficient of each feature.
model = LinearRegression().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# seaborn >= 0.12 accepts x/y as keyword arguments only; positional data raises TypeError.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:18.750745460114032
<matplotlib.axes._subplots.AxesSubplot at 0x1e6445255c0>
# Ridge (L2 penalty): bar chart of the absolute coefficient of each feature.
model = Ridge().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# seaborn >= 0.12 accepts x/y as keyword arguments only; positional data raises TypeError.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:4.671710857050353
<matplotlib.axes._subplots.AxesSubplot at 0x1e6440e4d30>
# Lasso (L1 penalty): bar chart of the absolute coefficient of each feature.
model = Lasso().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# seaborn >= 0.12 accepts x/y as keyword arguments only; positional data raises TypeError.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:8.672182455497687
<matplotlib.axes._subplots.AxesSubplot at 0x1e644255400>
from sklearn. svm import SVC
from sklearn. tree import DecisionTreeRegressor
from sklearn. ensemble import RandomForestRegressor
from sklearn. ensemble import GradientBoostingRegressor
from sklearn. neural_network import MLPRegressor
from xgboost. sklearn import XGBRegressor
from lightgbm. sklearn import LGBMRegressor
# Compare linear, tree, ensemble, neural-network and boosting regressors
# by 5-fold MAE on the log-price target.
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    MLPRegressor(solver='lbfgs', max_iter=100),
    XGBRegressor(n_estimators=100, objective='reg:squarederror'),
    LGBMRegressor(n_estimators=100),
]

result = {}
for model in models:
    # Text before the first '(' of the estimator's string form.
    model_name = str(model).partition('(')[0]
    scores = cross_val_score(
        model, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    result[model_name] = scores
    print(model_name + ' is finished')

# One column per model, one row per fold.
result = pd.DataFrame(result)
result.index = [f'cv{fold}' for fold in range(1, 6)]
result
# Candidate values for a greedy, one-parameter-at-a-time LightGBM search.
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []


def _mean_cv_mae(estimator):
    """Return the mean 5-fold cross-validated MAE of ``estimator`` on ``train_y_ln``."""
    fold_scores = cross_val_score(
        estimator, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    return np.mean(fold_scores)


# Step 1: score every objective with all other settings left at their defaults.
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = _mean_cv_mae(model)
    best_obj[obj] = score

# Step 2: with the lowest-MAE objective fixed, score every num_leaves value.
chosen_objective = min(best_obj, key=best_obj.get)
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=chosen_objective, num_leaves=leaves)
    score = _mean_cv_mae(model)
    best_leaves[leaves] = score

# Step 3: with objective and num_leaves fixed, score every max_depth value.
chosen_leaves = min(best_leaves, key=best_leaves.get)
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=chosen_objective,
                          num_leaves=chosen_leaves,
                          max_depth=depth)
    score = _mean_cv_mae(model)
    best_depth[depth] = score
# Best MAE after each greedy tuning step; the first point (0.143) is a hard-coded
# value — presumably the untuned LightGBM score, TODO confirm.
sns. lineplot( x= [ '0_initial' , '1_turning_obj' , '2_turning_leaves' , '3_turning_depth' ] , y= [ 0.143 , min ( best_obj. values( ) ) , min ( best_leaves. values( ) ) , min ( best_depth. values( ) ) ] )
from sklearn. model_selection import GridSearchCV
# Exhaustive search over the same three parameter lists used by the greedy search.
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
# Search on the same target (log price) and the same metric (MAE) that every
# other experiment in this file reports. Fitting on the raw price with the
# default R^2 scorer would select parameters for a different problem.
clf = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_absolute_error')
clf = clf.fit(train_X, train_y_ln)
# Best parameter combination (shown as the cell output in a notebook).
clf.best_params_
{ 'max_depth' : 15 , 'num_leaves' : 55 , 'objective' : 'regression' }
# Score the hard-coded parameter combination by 5-fold MAE on the log-price target.
tuned_params = {'objective': 'regression', 'num_leaves': 55, 'max_depth': 15}
model = LGBMRegressor(**tuned_params)
np.mean(
    cross_val_score(
        model, X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
)
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of an L1-objective LGBMRegressor.

    The optimiser proposes real-valued parameters, so the integer-valued ones
    are truncated with ``int`` before use.

    Args:
        num_leaves: Proposed ``num_leaves`` (truncated to int).
        max_depth: Proposed ``max_depth`` (truncated to int).
        subsample: Proposed row-subsampling fraction.
        min_child_samples: Proposed ``min_child_samples`` (truncated to int).

    Returns:
        ``1 - mean 5-fold MAE`` on ``train_y_ln``, so that a larger value
        means a better model.
    """
    estimator = LGBMRegressor(
        objective='regression_l1',
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        subsample=subsample,
        # LightGBM only applies `subsample` when the bagging frequency is
        # positive. With the default of 0 the fraction being tuned here would
        # have no effect on the model.
        subsample_freq=1,
        min_child_samples=int(min_child_samples),
    )
    val = cross_val_score(
        estimator,
        X=train_X, y=train_y_ln, verbose=0, cv=5,
        scoring=make_scorer(mean_absolute_error),
    ).mean()
    return 1 - val
# Search bounds for each parameter of rf_cv, given as (lower, upper).
search_space = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples': (2, 100)
}
rf_bo = BayesianOptimization(rf_cv, search_space)
rf_bo.maximize()
# rf_cv returns 1 - MAE, so convert the best target back to an MAE.
best_target = rf_bo.max['target']
print(1 - best_target)
# Summary of the MAE recorded at each stage of the experiment (hard-coded values).
stage_labels = ['0_origin', '1_log_transfer', '2_L1_&_L2', '3_change_model', '4_parameter_turning']
stage_mae = [1.36, 0.19, 0.19, 0.14, 0.13]
plt.figure(figsize=(13, 5))
sns.lineplot(x=stage_labels, y=stage_mae)