笔记——数据归一化 scikit-learn中的Scaler

数据归一化

在这里插入图片描述
在这里插入图片描述

除了边界比较明显的数据集(像素)，一般用均值方差归一化。

在这里插入图片描述

测试数据集要用训练集的平均数和标准差进行归一化

在这里插入图片描述

import numpy as np
import matplotlib.pyplot as plt

最值归一化 normalization

x=np.random.randint(0,100,100)
x
array([84,  5,  7, 97, 16, 15, 64, 71, 55, 58, 12,  0, 73, 41, 27, 92, 97,
       21, 29, 69, 46,  7, 70, 68, 61, 59, 65,  2, 70, 30, 34, 45, 86, 29,
       17, 21, 41, 50,  5, 51,  3, 27, 68, 25, 53, 76, 15,  9, 16, 63, 62,
       65, 39, 78, 76, 82, 83, 67, 51,  6, 32, 30, 99, 56, 65, 80, 31, 12,
        4, 33, 54, 95, 63, 87, 62, 55, 86, 27, 84, 96, 35, 54, 64, 88,  8,
       36, 99, 27, 50, 53, 95, 56, 20, 70, 15, 70, 27, 40,  4, 54])
(x-np.min(x))/(np.max(x)-np.min(x))
array([0.84848485, 0.05050505, 0.07070707, 0.97979798, 0.16161616,
       0.15151515, 0.64646465, 0.71717172, 0.55555556, 0.58585859,
       0.12121212, 0.        , 0.73737374, 0.41414141, 0.27272727,
       0.92929293, 0.97979798, 0.21212121, 0.29292929, 0.6969697 ,
       0.46464646, 0.07070707, 0.70707071, 0.68686869, 0.61616162,
       0.5959596 , 0.65656566, 0.02020202, 0.70707071, 0.3030303 ,
       0.34343434, 0.45454545, 0.86868687, 0.29292929, 0.17171717,
       0.21212121, 0.41414141, 0.50505051, 0.05050505, 0.51515152,
       0.03030303, 0.27272727, 0.68686869, 0.25252525, 0.53535354,
       0.76767677, 0.15151515, 0.09090909, 0.16161616, 0.63636364,
       0.62626263, 0.65656566, 0.39393939, 0.78787879, 0.76767677,
       0.82828283, 0.83838384, 0.67676768, 0.51515152, 0.06060606,
       0.32323232, 0.3030303 , 1.        , 0.56565657, 0.65656566,
       0.80808081, 0.31313131, 0.12121212, 0.04040404, 0.33333333,
       0.54545455, 0.95959596, 0.63636364, 0.87878788, 0.62626263,
       0.55555556, 0.86868687, 0.27272727, 0.84848485, 0.96969697,
       0.35353535, 0.54545455, 0.64646465, 0.88888889, 0.08080808,
       0.36363636, 1.        , 0.27272727, 0.50505051, 0.53535354,
       0.95959596, 0.56565657, 0.2020202 , 0.70707071, 0.15151515,
       0.70707071, 0.27272727, 0.4040404 , 0.04040404, 0.54545455])
X=np.random.randint(0,100,(50,2))
X[:10,:]
array([[55, 33],
       [51, 53],
       [40, 14],
       [10, 24],
       [90, 36],
       [76, 34],
       [45, 48],
       [86, 89],
       [88, 68],
       [ 4, 39]])
X=np.array(X,dtype=float)
X[:10,:]
array([[55., 33.],
       [51., 53.],
       [40., 14.],
       [10., 24.],
       [90., 36.],
       [76., 34.],
       [45., 48.],
       [86., 89.],
       [88., 68.],
       [ 4., 39.]])
X[:,0]=(X[:,0]-np.min(X[:,0]))/(np.max(X[:,0])-np.min(X[:,0]))
X[:,1]=(X[:,1]-np.min(X[:,1]))/(np.max(X[:,1])-np.min(X[:,1]))
X[:10,]
array([[0.55102041, 0.33333333],
       [0.51020408, 0.54166667],
       [0.39795918, 0.13541667],
       [0.09183673, 0.23958333],
       [0.90816327, 0.36458333],
       [0.76530612, 0.34375   ],
       [0.44897959, 0.48958333],
       [0.86734694, 0.91666667],
       [0.8877551 , 0.69791667],
       [0.03061224, 0.39583333]])
plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x2070630a988>

在这里插入图片描述

数据均值

np.mean(X[:,0])
0.540204081632653
np.mean(X[:,1])
0.5147916666666666

数据标准差

np.std(X[:,0])
0.28443891736063087
np.std(X[:,1])
0.28634346181639975

均值方差归一化

X2=np.random.randint(0,100,(50,2))
X2=np.array(X2,dtype=float)
X2[:,0]=(X2[:,0]-np.mean(X2[:,0]))/np.std(X2[:,0])
X2[:,1]=(X2[:,1]-np.mean(X2[:,1]))/np.std(X2[:,1])
X2[:10,]
array([[-1.24848758, -1.00045396],
       [ 0.55550544, -1.47236621],
       [ 1.39529529, -0.60719376],
       [ 1.36419196, -0.41056365],
       [ 0.83543539,  0.4546088 ],
       [ 1.11536534,  0.49393482],
       [-1.27959091,  1.47708533],
       [-1.27959091, -0.17460753],
       [ 0.55550544, -1.079106  ],
       [-0.78193766, -0.84314988]])
plt.scatter(X2[:,0],X2[:,1])
<matplotlib.collections.PathCollection at 0x207064e8a48>

在这里插入图片描述

np.mean(X2[:,0])
-6.106226635438361e-17
np.std(X2[:,0])
1.0
np.mean(X2[:,1])
# 接近 0（浮点误差量级，约 1e-16）
np.std(X2[:,1])
1.0

scikit-learn 中的 Scaler

import numpy as np
from sklearn import datasets 

iris=datasets.load_iris()

X=iris.data
y=iris.target
X[:10,]
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=666)

scikit-learn 中的 StandardScaler

from sklearn.preprocessing import StandardScaler
stdScater=StandardScaler()
stdScater.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
stdScater.mean_
array([5.83416667, 3.08666667, 3.70833333, 1.17      ])

scale_表示标准差

stdScater.scale_
array([0.81019502, 0.44327067, 1.76401924, 0.75317107])

进行数据归一transform

X_train=stdScater.transform(X_train)
# 注意：不要对测试集重新 fit！测试集必须用训练集的均值和标准差进行归一化（见上文）
X_test_std=stdScater.transform(X_test)
X_train[:10,]
array([[-0.90616043,  0.93246262, -1.30856471, -1.28788802],
       [-1.15301457, -0.19551636, -1.30856471, -1.28788802],
       [-0.16559799, -0.64670795,  0.22203084,  0.17260355],
       [ 0.45153738,  0.70686683,  0.95898425,  1.50032315],
       [-0.90616043, -1.32349533, -0.40154513, -0.09294037],
       [ 1.43895396,  0.25567524,  0.56216318,  0.30537551],
       [ 0.3281103 , -1.09789954,  1.0723617 ,  0.30537551],
       [ 2.1795164 , -0.19551636,  1.63924894,  1.23477923],
       [-0.78273335,  2.2860374 , -1.25187599, -1.42065998],
       [ 0.45153738, -2.00028272,  0.44878573,  0.43814747]])
X_test_std[:10,]
array([[-0.31739042,  0.16012815,  0.3146853 ,  0.23680294],
       [-0.09068298, -0.37363236,  0.66219054,  1.3992901 ],
       [-0.99751275, -1.70803364, -0.38032518, -0.40902326],
       [-0.09068298, -0.64051262,  0.66219054,  0.7534639 ],
       [-1.45092764,  0.42700841, -1.42284091, -1.44234518],
       [-0.43074414, -1.17427313,  0.0250976 , -0.02152754],
       [-0.2040367 , -0.37363236,  0.3146853 , -0.02152754],
       [ 0.70279308,  0.16012815,  0.72010808,  0.88262914],
       [ 0.47608563, -1.70803364,  0.25676776, -0.02152754],
       [-0.43074414, -0.90739287,  0.25676776, -0.15069278]])

KNN进行分类

from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

必须用归一化处理后的测试数据集X_test_std进行预测

knn_clf.score(X_test_std,y_test)
0.9666666666666667
knn_clf.score(X_test,y_test)
0.3333333333333333

MinMaxScaler

import numpy as np
from sklearn import datasets 

iris=datasets.load_iris()

X=iris.data
y=iris.target

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=666)
from sklearn.preprocessing import MinMaxScaler
minmaxscaler=MinMaxScaler()
minmaxscaler.fit(X_train)
# 注意：只在训练集上 fit，测试集用训练集的最大最小值进行归一化
X_train=minmaxscaler.transform(X_train)
X_test_std=minmaxscaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,y_train)
knn_clf.score(X_test_std,y_test)
0.9666666666666667
knn_clf.score(X_test,y_test)
0.3333333333333333

自己瞎写

for i in range(X.shape[1]):
    X_train[:,i]=(X_train[:,i]-np.mean(X_train[:,i]))/np.std(X_train[:,i])
for i in range(X.shape[1]):
    X_test[:,i]=(X_test[:,i]-np.mean(X_train[:,i]))/np.std(X_train[:,i])
X_train[:10,]
array([[-0.90616043,  0.93246262, -1.30856471, -1.28788802],
       [-1.15301457, -0.19551636, -1.30856471, -1.28788802],
       [-0.16559799, -0.64670795,  0.22203084,  0.17260355],
       [ 0.45153738,  0.70686683,  0.95898425,  1.50032315],
       [-0.90616043, -1.32349533, -0.40154513, -0.09294037],
       [ 1.43895396,  0.25567524,  0.56216318,  0.30537551],
       [ 0.3281103 , -1.09789954,  1.0723617 ,  0.30537551],
       [ 2.1795164 , -0.19551636,  1.63924894,  1.23477923],
       [-0.78273335,  2.2860374 , -1.25187599, -1.42065998],
       [ 0.45153738, -2.00028272,  0.44878573,  0.43814747]])
X_test[:10]
array([[5.6, 3. , 4.5, 1.5],
       [5.8, 2.8, 5.1, 2.4],
       [5. , 2.3, 3.3, 1. ],
       [5.8, 2.7, 5.1, 1.9],
       [4.6, 3.1, 1.5, 0.2],
       [5.5, 2.5, 4. , 1.3],
       [5.7, 2.8, 4.5, 1.3],
       [6.5, 3. , 5.2, 2. ],
       [6.3, 2.3, 4.4, 1.3],
       [5.5, 2.6, 4.4, 1.2]])
X_train_mean=np.array([np.mean(X_train[:,i]) for i in range(X.shape[1])])
X_train_std=np.array([np.std(X_train[:,i]) for i in range(X.shape[1])])  # 用全部行计算，标准化后每列 std 应为 1.0
X_train_mean
array([ 2.07241631e-16,  4.81096644e-16, -1.03620816e-16,  3.70074342e-16])
X_train_std
array([1.04213681, 1.18109687, 1.02159309, 0.97603196])

猜你喜欢

转载自blog.csdn.net/chairon/article/details/107234310