泰坦尼克数据集下载
导入需要的库
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
加载数据集
# Paths to the training and evaluation CSV files of the Titanic dataset.
train_file = './data/titanic/train.csv'
eval_file = './data/titanic/eval.csv'
# Read each CSV file into a pandas DataFrame.
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
# Print the first five rows of each DataFrame as a quick sanity check.
print(train_df.head())
print(eval_df.head())
其中使用**pd.read_csv()**函数读取csv文件;使用**train_df.head()**函数展示前五行数据。
划分特征和标签
# Remove the 'survived' column from each DataFrame and keep it as the labels.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')
# Preview the first rows of the extracted labels.
print(y_train.head())
print(y_eval.head())
其中使用**train_df.pop('survived')**函数将'survived'对应的列从train_df中移除,并赋值给y_train。
计算数据集中的统计量
# Show summary statistics of the training features.
train_df.describe()
对乘客年龄进行统计
# Plot a histogram of passenger ages split into 20 bins.
train_df.age.hist(bins=20)
bins=20是指将乘客年龄划分为20段。
对乘客性别进行统计
# Count passengers per sex and draw the counts as a horizontal bar chart.
train_df.sex.value_counts().plot(kind='barh')
kind='barh'画横向柱状图;kind='bar'画纵向柱状图。
对乘客舱位进行统计
# `class` is a reserved keyword in Python, so attribute access (train_df.class)
# would be a syntax error; use bracket indexing for this column instead.
train_df['class'].value_counts().plot(kind='barh')
分别对男乘客和女乘客的存活率进行统计
# Join features and labels column-wise, group the rows by sex, and plot the
# mean of 'survived' (i.e. the survival rate) of each group as horizontal bars.
pd.concat([train_df, y_train],
axis=1).groupby('sex').survived.mean().plot(kind='barh')
其中**pd.concat([train_df, y_train], axis=1)**指将标签和特征按列合并。**groupby('sex')**是指按照性别对样本进行分组。
特征处理
对于离散性特征,需要先进行one-hot编码,再输入模型;对于连续性特征,可以直接输入模型。
a. 对特征进行分类
# Categorical (discrete) features; these are one-hot encoded before being fed to the model.
# NOTE(review): `catagotical` is a misspelling of `categorical`; the name is kept
# because the later snippets refer to it.
catagotical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
'embark_town', 'alone']
# Numeric (continuous) features; these are fed to the model directly.
numeric_columns = ['age', 'fare']
# List that collects the processed feature columns.
feature_columns = []
b. 处理离散特征
# Build a one-hot (indicator) feature column for every categorical feature.
for catagotical_column in catagotical_columns:
    # The vocabulary is the set of distinct values seen in the training data.
    vocab = train_df[catagotical_column].unique()
    feature_columns.append(tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(catagotical_column, vocab)))
其中:
train_df[catagotical_column].unique() 用来获得离散性特征中的所有可能取到的值。如:
train_df['sex'].unique()
得到:
['male' 'female']
tf.feature_column.categorical_column_with_vocabulary_list(catagotical_column, vocab) 用于将每个取值(字符串或整数)映射为它在 vocab 中的整数索引(从0开始)。即将['male' 'female']映射成[0 1]。
tf.feature_column.indicator_column用来做one-hot编码。
c. 处理连续特征
# Numeric features need no encoding; wrap each one in a float32 numeric column.
for numeric_column in numeric_columns:
    feature_columns.append(tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))
连续特征可以直接被当成输入。所以只需要用tf.feature_column.numeric_column 即可。
d. 打印feature_columns
# Display the assembled list of feature columns.
feature_columns
得到:
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
构建dataset
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched tf.data.Dataset from a feature DataFrame and its labels.

    Args:
        data_df: DataFrame of features. It is converted with dict() so that
            each dataset element maps a column name to that column's value.
        label_df: Labels aligned row by row with data_df.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the examples before repeating and batching.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding (features, labels) batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        # Shuffle with a buffer of 10000 elements.
        dataset = dataset.shuffle(10000)
    # Repeating and batching apply whether or not the data was shuffled.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# Build the training dataset with batches of 5 examples (default epochs and shuffling).
train_dataset = make_dataset(train_df, y_train, batch_size=5)
在这里要将data_df转换成字典形式,转换成字典形式后,dict(data_df)的key值为列名('sex'等),value值为数据值,这样才符合tf.data.Dataset.from_tensor_slices的形式。比如 dataset = tf.data.Dataset.from_tensor_slices({"a": np.array([1.0, 2.0, 3.0, 4.0, 5.0]), "b": np.random.uniform(size=(5, 2))})
那么,函数会分别切分"a"中的数值以及"b"中的数值,最后dataset中的一个元素就是类似于 {"a": 1.0, "b": [0.9, 0.1]} 的形式。
打印train_dataset中的值
# Inspect a single batch: x is the dict of feature tensors, y the label tensor.
for x, y in train_dataset.take(1):
    print('x: ', x, '\n')
    print('y: ', y, '\n')
得到:
x: {
'sex': <tf.Tensor: id=495, shape=(5,), dtype=string, numpy=array([b'male', b'male', b'male', b'male', b'male'], dtype=object)>,
'age': <tf.Tensor: id=487, shape=(5,), dtype=float64, numpy=array([28., 36., 29., 28., 28.])>,
'n_siblings_spouses': <tf.Tensor: id=493, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>,
'parch': <tf.Tensor: id=494, shape=(5,), dtype=int32, numpy=array([0, 1, 0, 0, 0])>,
'fare': <tf.Tensor: id=492, shape=(5,), dtype=float64, numpy=array([ 0. , 512.3292, 9.5 , 8.05 , 7.7958])>,
'class': <tf.Tensor: id=489, shape=(5,), dtype=string, numpy=array([b'Second', b'First', b'Third', b'Third', b'Third'], dtype=object)>,
'deck': <tf.Tensor: id=490, shape=(5,), dtype=string, numpy=array([b'unknown', b'B', b'unknown', b'unknown', b'unknown'], dtype=object)>,
'embark_town': <tf.Tensor: id=491, shape=(5,), dtype=string, numpy=array([b'Southampton', b'Cherbourg', b'Southampton', b'Southampton', b'Southampton'], dtype=object)>,
'alone': <tf.Tensor: id=488, shape=(5,), dtype=string, numpy=array([b'y', b'n', b'y', b'y', b'y'], dtype=object)>
}
y: tf.Tensor([0 1 1 0 0], shape=(5,), dtype=int32)
将feature_columns应用到dataset中去
# Pass one batch through a DenseFeatures layer built from all feature columns
# and print the resulting dense numeric array.
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())
得到:
[[ 49. 1. 0. 0. 1. 0. 0. 1.
0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 89.1042 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0.
1. 0. ]
[ 15. 1. 0. 1. 0. 0. 1. 0.
0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 14.4542 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0.
0. 1. ]
[ 29. 0. 1. 0. 1. 0. 0. 0.
0. 0. 0. 1. 0. 0. 1. 0.
0. 0. 30. 0. 1. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0.
1. 0. ]
[ 32. 0. 1. 1. 0. 0. 1. 0.
0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 7.75 0. 1. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0.
1. 0. ]
[ 24. 1. 0. 0. 1. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 1.
0. 0. 247.5208 0. 1. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0.
1. 0. ]]
其中:keras.layers.DenseFeatures(feature_columns)(x) 是指将feature_columns应用到x中去,即将字符串转化成数值。如分别只将年龄和性别应用到x中去:
# Apply individual feature columns to one batch to compare their outputs.
for x, y in train_dataset.take(1):
    # Index 7 is 'age': the first numeric column, right after the seven categorical ones.
    age_column = feature_columns[7]
    # Index 0 is the one-hot encoded 'sex' column.
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())
得到:
[[23.]
[22.]
[51.]
[39.]
[19.]]
[[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]]
由此可见,对连续性特征不做处理,对离散型特征做one-hot编码。