声明:内容非原创,代码来自葁sir
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
% matplotlib inline
seeds = pd. read_csv( 'data/seeds.csv' , sep = '\t' , header = None )
seeds. head( )
0
1
2
3
4
5
6
7
0
15.26
14.84
0.8710
5.763
3.312
2.221
5.220
Kama
1
14.88
14.57
0.8811
5.554
3.333
1.018
4.956
Kama
2
14.29
14.09
0.9050
5.291
3.337
2.699
4.825
Kama
3
13.84
13.94
0.8955
5.324
3.379
2.259
4.805
Kama
4
16.14
14.99
0.9034
5.658
3.562
1.355
5.175
Kama
seeds[ 7 ] . value_counts( )
Kama 70
Rosa 70
Canadian 70
Name: 7, dtype: int64
seeds[ 7 ] . value_counts( ) . plot( kind = 'bar' )
<AxesSubplot:>
import seaborn as sns
sns. set ( )
from sklearn. model_selection import train_test_split
from sklearn. linear_model import Lasso, RidgeClassifier
from sklearn. neighbors import KNeighborsClassifier
from sklearn. tree import DecisionTreeClassifier
from sklearn. preprocessing import MinMaxScaler, StandardScaler
X = seeds. iloc[ : , : 7 ] . copy( )
X. shape
(210, 7)
X
0
1
2
3
4
5
6
0
15.26
14.84
0.8710
5.763
3.312
2.221
5.220
1
14.88
14.57
0.8811
5.554
3.333
1.018
4.956
2
14.29
14.09
0.9050
5.291
3.337
2.699
4.825
3
13.84
13.94
0.8955
5.324
3.379
2.259
4.805
4
16.14
14.99
0.9034
5.658
3.562
1.355
5.175
...
...
...
...
...
...
...
...
205
12.19
13.20
0.8783
5.137
2.981
3.631
4.870
206
11.23
12.88
0.8511
5.140
2.795
4.325
5.003
207
13.20
13.66
0.8883
5.236
3.232
8.315
5.056
208
11.84
13.21
0.8521
5.175
2.836
3.598
5.044
209
12.30
13.34
0.8684
5.243
2.974
5.637
5.063
210 rows × 7 columns
y = seeds. iloc[ : , - 1 ] . copy( )
y. shape
(210,)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size= 0.2 , random_state= 1 )
def knn_score(k, X, y, n_repeats=100, test_size=0.2):
    """Estimate KNN accuracy for ``k`` neighbours over repeated random splits.

    Parameters
    ----------
    k : int
        Value passed to ``KNeighborsClassifier(n_neighbors=k)``.
    X, y
        Features and labels handed to ``train_test_split``.
    n_repeats : int, default 100
        Number of train/test splits to average over.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(mean test accuracy, mean train accuracy)`` over all repetitions.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    test_scores = []
    train_scores = []
    for seed in range(n_repeats):
        # Seed each repetition with its own index. A constant random_state
        # reproduces the very same split every time, which makes averaging
        # pointless; using the index keeps the run reproducible while
        # actually varying the split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )
        knn.fit(X_train, y_train)
        test_scores.append(knn.score(X_test, y_test))
        train_scores.append(knn.score(X_train, y_train))
    return np.array(test_scores).mean(), np.array(train_scores).mean()
# Score KNN for several odd neighbour counts.
# Each entry maps k -> [mean test accuracy, mean train accuracy].
k_list = [1, 3, 5, 7, 9, 11]
result_dict = {}
for k in k_list:
    score, train_score = knn_score(k, X, y)
    result_dict[k] = [score, train_score]
result_dict
{1: [0.9047619047619047, 1.0],
3: [0.9047619047619047, 0.9642857142857139],
5: [0.8571428571428572, 0.9285714285714287],
7: [0.8571428571428572, 0.9345238095238096],
9: [0.8809523809523812, 0.9226190476190478],
11: [0.8809523809523812, 0.9226190476190478]}
pd. DataFrame( result_dict) . T
0
1
1
0.904762
1.000000
3
0.904762
0.964286
5
0.857143
0.928571
7
0.857143
0.934524
9
0.880952
0.922619
11
0.880952
0.922619
result = pd. DataFrame( result_dict) . T. copy( )
result. columns = [ 'Test' , 'Train' ]
result
Test
Train
1
0.904762
1.000000
3
0.904762
0.964286
5
0.857143
0.928571
7
0.857143
0.934524
9
0.880952
0.922619
11
0.880952
0.922619
result. plot( )
plt. xticks( k_list)
plt. show( )
进阶版
X. shape
(210, 7)
X. describe( ) . T
count
mean
std
min
25%
50%
75%
max
0
210.0
14.847524
2.909699
10.5900
12.27000
14.35500
17.305000
21.1800
1
210.0
14.559286
1.305959
12.4100
13.45000
14.32000
15.715000
17.2500
2
210.0
0.870999
0.023629
0.8081
0.85690
0.87345
0.887775
0.9183
3
210.0
5.628533
0.443063
4.8990
5.26225
5.52350
5.979750
6.6750
4
210.0
3.258605
0.377714
2.6300
2.94400
3.23700
3.561750
4.0330
5
210.0
3.700201
1.503557
0.7651
2.56150
3.59900
4.768750
8.4560
6
210.0
5.408071
0.491480
4.5190
5.04500
5.22300
5.877000
6.5500
def standard_X(X):
    """Return a copy of ``X`` with every column standardized on its own.

    Each column is passed through a freshly fitted ``StandardScaler``;
    the input frame itself is left unmodified.
    """
    scaled = X.copy()
    for column in scaled.columns:
        # Double brackets keep the column two-dimensional, which is the
        # layout StandardScaler expects.
        column_values = scaled[[column]].values
        scaler = StandardScaler()
        scaled[column] = scaler.fit_transform(column_values)
    return scaled
standard_X( X) . describe( [ 0.01 , 0.25 , 0.5 , 0.75 , 0.99 ] ) . T
count
mean
std
min
1%
25%
50%
75%
99%
max
0
210.0
-5.392512e-17
1.002389
-1.466714
-1.397504
-0.887955
-0.169674
0.846599
2.072913
2.181534
1
210.0
9.146123e-17
1.002389
-1.649686
-1.474607
-0.851433
-0.183664
0.887069
2.023505
2.065260
2
210.0
1.322091e-15
1.002389
-2.668236
-2.588824
-0.598079
0.103993
0.711677
1.678118
2.006586
3
210.0
-2.182910e-15
1.002389
-1.650501
-1.464372
-0.828682
-0.237628
0.794595
2.154459
2.367533
4
210.0
-2.030122e-16
1.002389
-1.668209
-1.634930
-0.834907
-0.057335
0.804496
1.936725
2.055112
5
210.0
-3.679596e-16
1.002389
-1.956769
-1.857934
-0.759148
-0.067469
0.712379
2.519905
3.170590
6
210.0
-1.337554e-16
1.002389
-1.813288
-1.633810
-0.740495
-0.377459
0.956394
2.130797
2.328998
查看数据分布
对标准化后的数据调用 describe 并查看 99% 分位数后,发现列标签为 2 和 5 的两列存在较大差距。
# Plot the distribution of every standardized feature, one figure per column.
stand_X = standard_X(X)
for col_name in stand_X.columns:
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is its documented replacement.
    sns.histplot(stand_X[col_name], kde=True, stat="density")
    plt.title(col_name)
    plt.show()
分箱操作
例如收入数据:10、3000、5000、10000000
以 5000 为分割点,将数据划分为高收入和低收入两类,再进行映射(减少数据之间的差异)。
X[ 0 ] = pd. cut( X[ 0 ] , bins = 5 , labels = [ 0 , 1 , 2 , 3 , 4 ] )
X[ 0 ]
0 2
1 2
2 1
3 1
4 2
..
205 0
206 0
207 1
208 0
209 0
Name: 0, Length: 210, dtype: category
Categories (5, int64): [0 < 1 < 2 < 3 < 4]
# Pass the data as the keyword argument ``x``: seaborn warns that positional
# use is deprecated and will stop working from version 0.12.
sns.countplot(x=X[0])
C:\Anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='0', ylabel='count'>
# Discretise every feature of X in place into five bins coded 0-4.
# Column 0 was already binned earlier, so it is binned a second time here.
for col_name in X.columns:
    binned = pd.cut(X[col_name], bins=5, labels=[0, 1, 2, 3, 4])
    X[col_name] = binned
X
0
1
2
3
4
5
6
0
2
2
2
2
2
0
1
1
2
2
3
1
2
0
1
2
1
1
4
1
2
1
0
3
1
1
3
1
2
0
0
4
2
2
4
2
3
0
1
...
...
...
...
...
...
...
...
205
0
0
3
0
1
1
0
206
0
0
1
0
0
2
1
207
1
1
3
0
2
4
1
208
0
0
1
0
0
1
1
209
0
0
2
0
1
3
1
210 rows × 7 columns
knn = KNeighborsClassifier( )
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2 , random_state = 1 )
knn. fit( X_train, y_train)
KNeighborsClassifier()
knn. score( X_train, y_train)
0.9166666666666666
knn. score( X_test, y_test)
0.9523809523809523