import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook magic for inline plots — commented out so this file is valid
# plain Python (the original "% matplotlib inline" with a space is a
# syntax error even in IPython).
# %matplotlib inline

# Load the abalone dataset: tab-separated, no header row, so columns get
# integer labels 0..8 (8 feature columns + the target as the last column).
data = pd.read_csv('./data/abalone.txt', header=None, sep='\t')
data.head()
0
1
2
3
4
5
6
7
8
0
1
0.455
0.365
0.095
0.5140
0.2245
0.1010
0.150
15
1
1
0.350
0.265
0.090
0.2255
0.0995
0.0485
0.070
7
2
-1
0.530
0.420
0.135
0.6770
0.2565
0.1415
0.210
9
3
1
0.440
0.365
0.125
0.5160
0.2155
0.1140
0.155
10
4
0
0.330
0.255
0.080
0.2050
0.0895
0.0395
0.055
7
# Per-column dtypes and non-null counts — all 4177 rows are non-null,
# so no missing-value handling is needed.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 4177 non-null int64
1 1 4177 non-null float64
2 2 4177 non-null float64
3 3 4177 non-null float64
4 4 4177 non-null float64
5 5 4177 non-null float64
6 6 4177 non-null float64
7 7 4177 non-null float64
8 8 4177 non-null int64
dtypes: float64(7), int64(2)
memory usage: 293.8 KB
# Transposed summary statistics, with 1st/99th percentiles added to the
# defaults to make tails/outliers visible.
data.describe(percentiles=[0.01, 0.99]).T
count
mean
std
min
1%
50%
99%
max
0
4177.0
0.052909
0.822240
-1.0000
-1.00000
0.0000
1.00000
1.0000
1
4177.0
0.523992
0.120093
0.0750
0.19500
0.5450
0.73500
0.8150
2
4177.0
0.407881
0.099240
0.0550
0.14000
0.4250
0.58000
0.6500
3
4177.0
0.139516
0.041827
0.0000
0.04500
0.1400
0.22000
1.1300
4
4177.0
0.828742
0.490389
0.0020
0.03576
0.7995
2.14442
2.8255
5
4177.0
0.359367
0.221963
0.0010
0.01350
0.3360
0.99778
1.4880
6
4177.0
0.180594
0.109614
0.0005
0.00788
0.1710
0.47610
0.7600
7
4177.0
0.238831
0.139203
0.0015
0.01038
0.2340
0.62000
1.0050
8
4177.0
9.933684
3.224169
1.0000
4.00000
9.0000
20.00000
29.0000
# Frequency of each value of the last column (the prediction target).
data[data.columns[-1]].value_counts()
9 689
10 634
8 568
11 487
7 391
12 267
6 259
13 203
14 126
5 115
15 103
16 67
17 58
4 57
18 42
19 32
20 26
3 15
21 14
23 9
22 6
27 2
24 2
1 1
26 1
29 1
2 1
25 1
Name: 8, dtype: int64
# Features = every column except the last; target = the last column.
# .copy() detaches both from `data` so later edits cannot alias it.
feature_cols = data.columns[:-1]
X = data[feature_cols].copy()
y = data[data.columns[-1]].copy()
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LinearRegression, Lasso, Ridge
from sklearn. neighbors import KNeighborsRegressor
from sklearn. tree import DecisionTreeRegressor
from sklearn. ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
C:\Anaconda\lib\site-packages\xgboost\compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance, keeping the
# result as a DataFrame with the original column labels.
# NOTE(review): the scaler is fit on the FULL dataset before the
# train/test split below, so test-set statistics leak into the
# transform — consider fitting on X_train only and transforming X_test.
scaler = StandardScaler()
s_X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
s_X
0
1
2
3
4
5
6
7
0
1.151980
-0.574558
-0.432149
-1.064424
-0.641898
-0.607685
-0.726212
-0.638217
1
1.151980
-1.448986
-1.439929
-1.183978
-1.230277
-1.170910
-1.205221
-1.212987
2
-1.280690
0.050033
0.122130
-0.107991
-0.309469
-0.463500
-0.356690
-0.207139
3
1.151980
-0.699476
-0.432149
-0.347099
-0.637819
-0.648238
-0.607600
-0.602294
4
-0.064355
-1.615544
-1.540707
-1.423087
-1.272086
-1.215968
-1.287337
-1.320757
...
...
...
...
...
...
...
...
...
4172
-1.280690
0.341509
0.424464
0.609334
0.118813
0.047908
0.532900
0.073062
4173
1.151980
0.549706
0.323686
-0.107991
0.279929
0.358808
0.309362
0.155685
4174
1.151980
0.632985
0.676409
1.565767
0.708212
0.748559
0.975413
0.496955
4175
-1.280690
0.841182
0.777187
0.250672
0.541998
0.773341
0.733627
0.410739
4176
1.151980
1.549052
1.482634
1.326659
2.283681
2.640993
1.787449
1.840481
4177 rows × 8 columns
# Hold out 20% for testing; the seed fixes the split so every model below
# is evaluated on the same partition.
X_train, X_test, y_train, y_test = train_test_split(s_X, y, test_size=0.2, random_state=19)

# Random-forest baseline. random_state pins the bootstrap/feature sampling
# (the original was unseeded, so its reported R^2 was not reproducible);
# n_jobs=-1 trains the 500 trees in parallel on all cores.
RF = RandomForestRegressor(n_estimators=500, random_state=19, n_jobs=-1)
RF.fit(X_train, y_train)
RF.score(X_test, y_test)
0.5040997932157716
# Ordinary least-squares baseline (deterministic, no seed needed);
# score() reports R^2 on the held-out set.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
0.46853356891462006
# k-nearest-neighbours regression with library-default settings;
# features were standardized above, which k-NN's distance metric needs.
knn = KNeighborsRegressor().fit(X_train, y_train)
knn.score(X_test, y_test)
0.4416450520111813
# Single unpruned decision tree — expected to overfit badly (its test R^2
# is far below the ensembles'). random_state pins tie-breaking among
# equally-good splits; the original was unseeded and so irreproducible.
dt = DecisionTreeRegressor(random_state=19)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
0.054795460519916794
# AdaBoost over depth-3 regression trees. random_state makes the boosting
# resampling reproducible (the original was unseeded).
# NOTE(review): `base_estimator` was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4 — change the keyword if upgrading.
ada = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=1.0,
    random_state=19,
)
ada.fit(X_train, y_train)
ada.score(X_test, y_test)
0.2854128443586734
# Gradient-boosted trees with library-default hyperparameters; seeded so
# the reported test R^2 is reproducible (the original was unseeded).
gbdt = GradientBoostingRegressor(random_state=19)
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
0.49482544308232923
# XGBoost with default hyperparameters; fit and score split onto separate
# statements (XGBRegressor.fit returns self, so this matches the chained
# original exactly).
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)
C:\Anaconda\lib\site-packages\xgboost\data.py:208: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
0.44712420076610004