TensorFlow预测燃油效率

环境：Jupyter Notebook、TensorFlow 2.1.0、Python 3.7.5

import pathlib
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras as keras
from tensorflow.keras import layers as layers

# Fetch the Auto MPG data file via Keras and print the local path it was stored at.
dataset_path = keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
print(dataset_path)

/Users/xxxx/.keras/datasets/auto-mpg.data

column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']
# Columns: miles per gallon, cylinders, displacement, horsepower, weight, acceleration, model year, origin
raw_dataset=pd.read_csv(dataset_path,
                        names=column_names, # column headers to use
                        na_values='?', # read '?' entries as missing values (NaN)
                        comment='\t', # ignore the rest of each line from the first tab onward (presumably the trailing car-name field; verify against the data file)
                        sep=' ', # delimiter to use
                        skipinitialspace=True) # skip spaces after the delimiter

# Work on a copy so that raw_dataset itself is not modified by the steps below.
dataset = raw_dataset.copy()

dataset.tail() # return the last n rows; n defaults to 5

	MPG	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	Origin
393	27.0	4	140.0	86.0	2790.0	15.6	82	1
394	44.0	4	97.0	52.0	2130.0	24.6	82	2
395	32.0	4	135.0	84.0	2295.0	11.6	82	1
396	28.0	4	120.0	79.0	2625.0	18.6	82	1
397	31.0	4	119.0	82.0	2720.0	19.4	82	1

dataset.isna().sum() # count the missing values in each column

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

# Discard every row that contains a missing value.
dataset = dataset.dropna()

# Remove the categorical 'Origin' column from the frame and keep its values
# so they can be turned into indicator columns.
origin = dataset.pop('Origin')

# One-hot encode the origin code: each region gets its own column holding
# 1.0 where the row has that code and 0.0 elsewhere.
for origin_code, region in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[region] = (origin == origin_code) * 1.0

dataset.tail() # show the last rows again, now with the USA/Europe/Japan indicator columns

	MPG	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	USA	Europe	Japan
393	27.0	4	140.0	86.0	2790.0	15.6	82	1.0	0.0	0.0
394	44.0	4	97.0	52.0	2130.0	24.6	82	0.0	1.0	0.0
395	32.0	4	135.0	84.0	2295.0	11.6	82	1.0	0.0	0.0
396	28.0	4	120.0	79.0	2625.0	18.6	82	1.0	0.0	0.0
397	31.0	4	119.0	82.0	2720.0	19.4	82	1.0	0.0	0.0

train_dataset = dataset.sample(frac=0.8,random_state=0) # randomly draw 80% of the rows, with random seed 0
test_dataset = dataset.drop(train_dataset.index) # the rows not drawn for training form the test set

# Pairwise plots of four training-set columns, with kernel density estimates on the diagonal.
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]],diag_kind='kde')

<seaborn.axisgrid.PairGrid at 0x10cd98110>

# Summary statistics (count, mean, std, quartiles, ...) of every training-set column.
train_stats = train_dataset.describe()
train_stats.pop("MPG") # remove the MPG column from the statistics; in an interactive Jupyter cell the popped column is also echoed as output

count    314.000000
mean      23.310510
std        7.728652
min       10.000000
25%       17.000000
50%       22.000000
75%       28.950000
max       46.600000
Name: MPG, dtype: float64

train_stats = train_stats.transpose() # one row per feature, one column per statistic

train_stats

	count	mean	std	min	25%	50%	75%	max
Cylinders	314.0	5.477707	1.699788	3.0	4.00	4.0	8.00	8.0
Displacement	314.0	195.318471	104.331589	68.0	105.50	151.0	265.75	455.0
Horsepower	314.0	104.869427	38.096214	46.0	76.25	94.5	128.00	225.0
Weight	314.0	2990.251592	843.898596	1649.0	2256.50	2822.5	3608.00	5140.0
Acceleration	314.0	15.559236	2.789230	8.0	13.80	15.5	17.20	24.8
Model Year	314.0	75.898089	3.675642	70.0	73.00	76.0	79.00	82.0
USA	314.0	0.624204	0.485101	0.0	0.00	1.0	1.00	1.0
Europe	314.0	0.178344	0.383413	0.0	0.00	0.0	0.00	1.0
Japan	314.0	0.197452	0.398712	0.0	0.00	0.0	0.00	1.0

# Split the MPG column off from the feature frames.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
# This label is the value the model is trained to predict.

def norm(x):
    """Standardize *x* with the training-set statistics.

    Subtracts the per-feature mean and divides by the per-feature standard
    deviation, both taken from the module-level ``train_stats`` table.
    """
    feature_mean = train_stats['mean']
    feature_std = train_stats['std']
    return (x - feature_mean) / feature_std

normed_train_data = norm(train_dataset) # standardize the training features
normed_test_data = norm(test_dataset) # standardize the test features with the same training-set statistics

# Fully connected regression network: two hidden ReLU layers of 64 units
# followed by a single linear output unit.
feature_count = len(train_dataset.keys())
model = keras.models.Sequential([
    layers.Dense(64, activation='relu', input_shape=[feature_count]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

# Optimize mean squared error with RMSprop; also track MAE and MSE as metrics.
rmsprop = tf.keras.optimizers.RMSprop(0.001)
model.compile(loss='mse', optimizer=rmsprop, metrics=['mae', 'mse'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 64)                640       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
=================================================================
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________

# Sanity check: run the model (not yet fitted at this point) on the first 10 training rows.
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)

print(example_result)

[[ 0.06187941]
 [ 0.16284567]
 [ 0.19416149]
 [ 0.3226478 ]
 [ 0.09883147]
 [ 0.00343724]
 [ 0.13330291]
 [ 0.62984717]
 [-0.05348695]
 [ 0.44078857]]

# Show training progress by printing one dot for every completed epoch.
class PrintDot(keras.callbacks.Callback):
    """Keras callback that prints a dot at the end of each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print a dot for *epoch*, starting a new output row every 100 epochs."""
        starts_new_row = epoch % 100 == 0
        if starts_new_row:
            print(' ')  # ends the current row of dots
        print('.', end=' ')

# Train for 1000 epochs and keep the per-epoch metrics in `history`.
history=model.fit(normed_train_data,
                  train_labels,
                  epochs=1000,
                  validation_split=0.2, # hold out 20% of the training data for validation
                  verbose=0,  # do not show the progress bar
                  callbacks=[PrintDot()]) # report progress through the PrintDot callback

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

# Collect the recorded training metrics into a DataFrame, one row per epoch.
hist=pd.DataFrame(history.history)
hist['epoch']=history.epoch # add an 'epoch' column

hist.tail()

	loss	mae	mse	val_loss	val_mae	val_mse	epoch
995	2.448885	0.975710	2.448885	9.030066	2.268713	9.030066	995
996	2.376843	0.999163	2.376843	9.096817	2.273271	9.096817	996
997	2.383884	0.992754	2.383883	9.657296	2.356696	9.657296	997
998	2.504148	1.021134	2.504148	9.152325	2.318949	9.152325	998
999	2.421463	0.947287	2.421463	9.146635	2.284075	9.146635	999

def plot_history(history):
    """Plot training and validation error curves from a Keras fit history.

    Reads ``history.history`` (must contain 'mae', 'val_mae', 'mse' and
    'val_mse') and ``history.epoch``, draws one figure for MAE and one for
    MSE, then calls ``plt.show()``.
    """
    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch

    # (y-axis label, metric column, upper y-axis limit), in drawing order.
    panels = (
        ('Mean Abs Error [MPG]', 'mae', 5),
        ('Mean Square Error [$MPG^2$]', 'mse', 20),
    )
    for y_label, metric, y_max in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        # Training curve first, then the validation curve for the same metric.
        plt.plot(frame['epoch'], frame[metric], label='Train Error')
        plt.plot(frame['epoch'], frame['val_' + metric], label='Val Error')
        plt.ylim([0, y_max])
        plt.legend()  # draw the legend box with the two curve labels
    plt.show()

plot_history(history) # the plot shows that after roughly 100 epochs the error no longer improves and instead gets worse

train_dataset.keys()

Index(['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration',
       'Model Year', 'USA', 'Europe', 'Japan'],
      dtype='object')

# Rebuild the same network from scratch so training starts from fresh weights.
feature_count = len(train_dataset.keys())
model = keras.models.Sequential([
    layers.Dense(64, activation='relu', input_shape=[feature_count]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

rmsprop = tf.keras.optimizers.RMSprop(0.001)
model.compile(loss='mse', optimizer=rmsprop, metrics=['mae', 'mse'])

# Stop training once val_loss has gone `patience` (10) epochs without
# improving. A change counts as an improvement only if it exceeds
# min_delta, which is left at its default of 0 here.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(
    normed_train_data,
    train_labels,
    epochs=1000,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

plot_history(history)

test_predictions = model.predict(normed_test_data).flatten() # flatten the predictions into a 1-D array

# Scatter plot of predicted against true MPG on the test set.
plt.scatter(test_labels,test_predictions)
plt.xlabel('True values [MPG]')
plt.ylabel("Predictions [MPG]")
plt.axis('equal') # equal scaling on both axes
plt.axis('square')
plt.xlim([0,plt.xlim()[1]]) # start the x axis at 0, keep the current upper limit
plt.ylim([0,plt.ylim()[1]]) # start the y axis at 0, keep the current upper limit
_ = plt.plot([-100,100],[-100,100]) # draw the line segment from (-100, -100) to (100, 100), i.e. prediction == true value

# Distribution of the prediction errors on the test set.
error = test_predictions-test_labels
plt.hist(error, bins=25) # histogram with 25 bins
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel("Count")

谁唱江南断肠句

发布了81 篇原创文章 · 获赞 27 · 访问量 1万+

私信关注

TensorFlow预测燃油效率

猜你喜欢