Unsupervised Learning and Preprocessing 3-3: Preprocessing and Scaling

# Applying data transformations
import mglearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_moons
from sklearn.datasets import make_blobs
from sklearn.datasets import make_circles
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D,axes3d

mglearn.plots.plot_scaling()

[Figure: output of mglearn.plots.plot_scaling(), comparing the original data with the same data after StandardScaler, RobustScaler, MinMaxScaler, and Normalizer]
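
The figure compares the original data with the output of the four scaling classes in sklearn.preprocessing. As a minimal sketch (my own addition, on small synthetic blob data rather than the data used in the plot), all four share the same fit/transform interface:

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

x_demo,_ = make_blobs(n_samples=50,centers=2,random_state=4,cluster_std=1)
for demo_scaler in [StandardScaler(),RobustScaler(),MinMaxScaler(),Normalizer()]:
    x_demo_scaled = demo_scaler.fit_transform(x_demo)  # fit_transform = fit followed by transform
    print(type(demo_scaler).__name__,x_demo_scaled.min(axis=0),x_demo_scaled.max(axis=0))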

cancer = load_breast_cancer()
x_train,x_test,y_train,y_test = train_test_split(cancer.data,cancer.target,random_state=1)
print(x_train.shape)
print(x_test.shape)

(426, 30)
(143, 30)

scaler = MinMaxScaler()  # shift and rescale the data so that each feature lies between 0 and 1
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
print("trainformed shape:{}".format(x_train_scaled.shape))
print("pre-feature mininum before:{}".format(x_train.min(axis=0)))
print("pre-feature maxinum before:{}".format(x_train.max(axis=0)))
print("pre-feature mininum after:{}".format(x_train_scaled.min(axis=0)))
print("pre-feature maxinum after:{}".format(x_train_scaled.max(axis=0)))
x_test_scaled = scaler.transform(x_test)
print("pre-feature mininum after:{}".format(x_test_scaled.min(axis=0)))
print("pre-feature maxinum after:{}".format(x_test_scaled.max(axis=0)))

transformed shape:(426, 30)
per-feature minimum before:[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
0.000e+00 1.060e-01 5.024e-02 1.153e-01 3.602e-01 7.570e-01 6.802e+00
1.713e-03 2.252e-03 0.000e+00 0.000e+00 9.539e-03 8.948e-04 7.930e+00
1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
1.566e-01 5.521e-02]
per-feature maximum before:[2.811e+01 3.928e+01 1.885e+02 2.501e+03 1.634e-01 2.867e-01 4.268e-01
2.012e-01 3.040e-01 9.575e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
3.113e-02 1.354e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
4.954e+01 2.512e+02 4.254e+03 2.226e-01 9.379e-01 1.170e+00 2.910e-01
5.774e-01 1.486e-01]
per-feature minimum after:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
per-feature maximum after:[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
per-feature minimum after:[ 0.0336031 0.0226581 0.03144219 0.01141039 0.14128374 0.04406704 0. 0. 0.1540404 -0.00615249 -0.00137796 0.00594501
0.00430665 0.00079567 0.03919502 0.0112206 0. 0.
-0.03191387 0.00664013 0.02660975 0.05810235 0.02031974 0.00943767
0.1094235 0.02637792 0. 0. -0.00023764 -0.00182032]
per-feature maximum after:[0.9578778 0.81501522 0.95577362 0.89353128 0.81132075 1.21958701
0.87956888 0.9333996 0.93232323 1.0371347 0.42669616 0.49765736
0.44117231 0.28371044 0.48703131 0.73863671 0.76717172 0.62928585
1.33685792 0.39057253 0.89612238 0.79317697 0.84859804 0.74488793
0.9154725 1.13188961 1.07008547 0.92371134 1.20532319 1.63068851]
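
Note that the minimum and maximum of the scaled test set are not exactly 0 and 1: transform always applies the minimum and range learned from the training set, never statistics computed on the test set itself. A minimal sketch (my own addition) of the computation MinMaxScaler performs, using the x_train and x_test arrays above:

train_min = x_train.min(axis=0)
train_range = x_train.max(axis=0) - train_min
x_test_manual = (x_test - train_min) / train_range   # training-set statistics applied to the test set
print(np.allclose(x_test_manual,x_test_scaled))      # True (up to floating-point rounding)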

x,_ = make_blobs(n_samples=50,centers=5,random_state=4,cluster_std=2)
x_train,x_test = train_test_split(x,random_state=5,test_size=.1)
fig,axes = plt.subplots(1,3,figsize=(13,4))

axes[0].scatter(x_train[:,0],x_train[:,1],c=mglearn.cm2(0),label='training set',s=60)
axes[0].scatter(x_test[:,0],x_test[:,1],marker='^',c=mglearn.cm2(1),label='test set',s=60)
axes[0].legend(loc='upper left')
axes[0].set_title('original data')
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
axes[1].scatter(x_train_scaled[:,0],x_train_scaled[:,1],c=mglearn.cm2(0),label='training set',s=60)
axes[1].scatter(x_test_scaled[:,0],x_test_scaled[:,1],marker='^',c=mglearn.cm2(1),label='test set',s=60)
axes[1].set_title('scaled data')
test_scaler = MinMaxScaler()
test_scaler.fit(x_test)
x_test_scaled_badly = test_scaler.transform(x_test)
axes[2].scatter(x_train_scaled[:,0],x_train_scaled[:,1],c=mglearn.cm2(0),label='training set',s=60)
axes[2].scatter(x_test_scaled_badly[:,0],x_test_scaled_badly[:,1],marker='^',c=mglearn.cm2(1),label='test set',s=60)
axes[2].set_title('improperly scaled data')
for ax in axes:
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')

[Figure: three scatter plots of the blob data - the original data, the data scaled with a MinMaxScaler fit on the training set, and the improperly scaled data where a separate scaler was fit on the test set]
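
As a shortcut, every scikit-learn transformer also provides fit_transform, which fits and transforms the same data in one call (and is sometimes more efficient than calling fit and transform separately). Sketched on the blob split above:

scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)  # same result as scaler.fit(x_train).transform(x_train)
x_test_scaled = scaler.transform(x_test)        # the test set still uses the training-set statistics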

x_train,x_test,y_train,y_test = train_test_split(cancer.data,cancer.target,random_state=0)
svm = SVC(C=100)
svm.fit(x_train,y_train)
print('train set accuracy:{:.2f}'.format(svm.score(x_train,y_train)))
print('test set accuracy:{:.2f}'.format(svm.score(x_test,y_test)))
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
svm.fit(x_train_scaled,y_train)
print('scaled test set accuracy:{:.2f}'.format(svm.score(x_test_scaled,y_test)))

train set accuracy:1.00
test set accuracy:0.63
scaled test set accuracy:0.97
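
Chaining the scaler and the SVC in a Pipeline keeps the fit-on-training-data-only rule automatic; the following sketch is my own addition and should reproduce the scaled result above:

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(MinMaxScaler(),SVC(C=100))
pipe.fit(x_train,y_train)  # the MinMaxScaler inside the pipeline is fit on x_train only
print('pipeline test set accuracy:{:.2f}'.format(pipe.score(x_test,y_test)))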

scaler = StandardScaler()  # zero mean and unit variance
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
svm.fit(x_train_scaled,y_train)
print('svm test set accuracy:{:.2f}'.format(svm.score(x_test_scaled,y_test)))

svm test set accuracy:0.96
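
A quick check (my own addition) that StandardScaler really produced roughly zero mean and unit variance per feature on the training data:

print(x_train_scaled.mean(axis=0).round(2))  # all entries close to 0
print(x_train_scaled.std(axis=0).round(2))   # all entries close to 1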

Reposted from blog.csdn.net/heroybc/article/details/103109620