【Kaggle Hands-On】Titanic Survival Prediction

Dataset: https://www.kaggle.com/c/titanic/data

Environment: Python 3.6 + TensorFlow 1.9

Project layout:

【data】

    1. train.csv: training set

    2. test.csv: test set (no labels)

    3. gender_submission.csv: reference labels for the test set (Kaggle's sample submission, used here as ground truth)
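
Before writing the preprocessing code, it is worth checking which columns actually contain missing values, since this motivates the fillna calls below. A minimal sketch (assuming the data/ paths used in run.py):

import pandas as pd

train = pd.read_csv('data/train.csv')
print(train.shape)           # (891, 12)
print(train.isnull().sum())  # Age, Cabin and Embarked contain NaN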

             

【data_process.py: data preprocessing and manual feature selection】

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np


def trainDataProcess(filepath):
    """
    训练数据预处理
    :param filepath: 
    :return: data_train, data_target
    """
    # Read the training set
    data = pd.read_csv(filepath)

    # Manually selected features; uninformative columns already dropped
    data = data[
        ['Survived',  # survived (0/1)
         'Pclass',  # ticket class (1/2/3)
         'Sex',  # sex
         'Age',  # age
         'SibSp',  # number of siblings/spouses aboard
         'Parch',  # number of parents/children aboard
         'Fare',  # ticket fare
         'Cabin',  # cabin number
         'Embarked'  # port of embarkation
         ]
    ]

    # Handle missing values (NA/NaN) and numericalize string columns
    data['Age'] = data['Age'].fillna(data['Age'].mean())  # Age: fill missing values with the mean age
    data['Cabin'] = pd.factorize(data.Cabin)[0]  # Cabin: item 0 of the returned tuple holds the integer codes
    data.fillna(0, inplace=True)  # fill all remaining missing values with 0
    data['Sex'] = [1 if x == 'male' else 0 for x in data.Sex]  # Sex: male=1, female=0

    # Pclass: one-hot encode the three ticket classes to avoid implying a linear relation
    # such as "class 2 = 2 * class 1"; add three new columns and drop the original
    data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)  # p1: 1 if first class, else 0
    data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)  # p2: 1 if second class, else 0
    data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)  # p3: 1 if third class, else 0
    del data['Pclass']
    # Embarked: same treatment as above
    data['e1'] = np.array(data['Embarked'] == 'S').astype(np.int32)
    data['e2'] = np.array(data['Embarked'] == 'C').astype(np.int32)
    data['e3'] = np.array(data['Embarked'] == 'Q').astype(np.int32)
    del data['Embarked']

    # print(data[['p1','Cabin']])  # use double brackets to select multiple columns

    # Columns after preprocessing
    data_train = data[
        [
         'p1','p2','p3',  # ticket class (one-hot)
         'Sex',  # sex
         'Age',  # age
         'SibSp',  # number of siblings/spouses aboard
         'Parch',  # number of parents/children aboard
         'Fare',  # ticket fare
         'Cabin',  # cabin number (factorized)
         'e1','e2','e3'  # port of embarkation (one-hot)
         ]
    ]  # input: 891 x 12

    data_target = data['Survived'].values.reshape(len(data),1)  # target: 891 x 1 (column vector)
    return data_train,data_target


def testDataProcess(filepath):
    """
    测试数据预处理
    :param filepath: 
    :return: data_test
    """
    # Read the test set
    data = pd.read_csv(filepath)

    # Manually selected features; uninformative columns already dropped
    data = data[
        [
         'Pclass',  # ticket class (1/2/3)
         'Sex',  # sex
         'Age',  # age
         'SibSp',  # number of siblings/spouses aboard
         'Parch',  # number of parents/children aboard
         'Fare',  # ticket fare
         'Cabin',  # cabin number
         'Embarked'  # port of embarkation
         ]
    ]

    # Handle missing values (NA/NaN) and numericalize string columns
    data['Age'] = data['Age'].fillna(data['Age'].mean())  # Age: fill missing values with the mean age
    data['Cabin'] = pd.factorize(data.Cabin)[0]  # Cabin: item 0 of the returned tuple holds the integer codes
    data.fillna(0, inplace=True)  # fill all remaining missing values with 0 (the test set also has one missing Fare)
    data['Sex'] = [1 if x == 'male' else 0 for x in data.Sex]  # Sex: male=1, female=0

    # Pclass: one-hot encode the three ticket classes to avoid implying a linear relation
    # such as "class 2 = 2 * class 1"; add three new columns and drop the original
    data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)  # p1: 1 if first class, else 0
    data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)  # p2: 1 if second class, else 0
    data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)  # p3: 1 if third class, else 0
    del data['Pclass']
    # Embarked: same treatment as above
    data['e1'] = np.array(data['Embarked'] == 'S').astype(np.int32)
    data['e2'] = np.array(data['Embarked'] == 'C').astype(np.int32)
    data['e3'] = np.array(data['Embarked'] == 'Q').astype(np.int32)
    del data['Embarked']

    # Columns after preprocessing
    data_test = data[
        [
         'p1','p2','p3',  # ticket class (one-hot)
         'Sex',  # sex
         'Age',  # age
         'SibSp',  # number of siblings/spouses aboard
         'Parch',  # number of parents/children aboard
         'Fare',  # ticket fare
         'Cabin',  # cabin number (factorized)
         'e1','e2','e3'  # port of embarkation (one-hot)
         ]
    ]  # input: 418 x 12

    return data_test
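
One detail worth knowing about the Cabin handling above: pd.factorize encodes missing values as -1, so the later fillna(0) never touches the Cabin column, and passengers without a cabin number get the code -1 rather than 0. A quick standalone check (not part of the project):

import pandas as pd

codes, uniques = pd.factorize(pd.Series(['C85', None, 'C123', 'C85']))
print(codes)    # [ 0 -1  1  0]  NaN becomes -1, not NaN
print(uniques)  # Index(['C85', 'C123'], dtype='object')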

【network.py: build and train the network】

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np


def network(data_train, data_target, data_test, test_label):
    """
    Build and train the network.
    :param data_train: training inputs (891 x 12)
    :param data_target: training targets (891 x 1)
    :param data_test: test inputs (418 x 12)
    :param test_label: test targets (418 x 1)
    :return: loss_train, train_acc, test_acc
    """

    x = tf.placeholder("float", shape=[None,12])  # None rows: batch size may vary
    y = tf.placeholder("float", shape=[None,1])

    weight = tf.Variable(tf.random_normal(shape=[12,1]),name='weight')  # 12 inputs per row, 1 output
    bias = tf.Variable(tf.random_normal(shape=[1]),name='bias')

    output = tf.matmul(x,weight)+bias  # output = XW + b
    pred = tf.cast(tf.sigmoid(output) > 0.5,
                   tf.float32)  # threshold the sigmoid at 0.5 to get 0/1 predictions

    # Note: logits takes the raw output, because sigmoid_cross_entropy_with_logits applies the sigmoid internally
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y,  # targets
                                                                  logits=output))  # raw scores

    train_step = tf.train.GradientDescentOptimizer(0.0003).minimize(loss)

    accuracy = tf.reduce_mean(tf.cast(tf.equal(pred,y),
                                      tf.float32))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())  # initialize variables
    loss_train = []  # loss recorded every 1000 epochs
    train_acc = []  # training accuracy recorded every 1000 epochs
    test_acc = []  # test accuracy recorded every 1000 epochs

    for epoch in range(25000):
        # Shuffle the training order (row indices) each epoch to help prevent overfitting
        index = np.random.permutation(len(data_target))
        data_train = data_train.iloc[index]
        data_target = data_target[index]

        for batch in range(len(data_target)//100+1):  # 891 rows, so 9 batches: batch = 0~8
            batch_xs = data_train[batch * 100:batch * 100 + 100]  # 100 rows per batch (the last batch has 91)
            batch_ys = data_target[batch * 100:batch * 100 + 100]
            sess.run(train_step, feed_dict={x:batch_xs,
                                            y:batch_ys})

        # Every 1000 epochs, record the training loss/accuracy (on the last batch) and the test accuracy
        if epoch % 1000 == 0:
            loss_temp = sess.run(loss, feed_dict={x:batch_xs,
                                                  y:batch_ys})
            train_acc_temp = sess.run(accuracy, feed_dict={x:batch_xs,
                                                           y:batch_ys})
            test_acc_temp = sess.run(accuracy, feed_dict={x:data_test,
                                                          y:test_label})
            loss_train.append(loss_temp)
            train_acc.append(train_acc_temp)
            test_acc.append(test_acc_temp)
            print('The epoch ',epoch,
                  ' loss = ',loss_temp,
                  ' train_acc = ',train_acc_temp,
                  ' test_acc = ',test_acc_temp,
                  '\n', end='')

    return loss_train, train_acc, test_acc
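
For reference, tf.nn.sigmoid_cross_entropy_with_logits computes the numerically stable form max(z, 0) - z*y + log(1 + exp(-|z|)) per element, which is algebraically equal to the usual cross entropy -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))]. A small numpy sanity check (a sketch, independent of the project code):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

z, y = 0.7, 1.0  # a raw logit and its 0/1 target
stable = max(z, 0.0) - z * y + np.log1p(np.exp(-abs(z)))
naive = -(y * np.log(sigmoid(z)) + (1 - y) * np.log(1 - sigmoid(z)))
print(np.isclose(stable, naive))  # True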

【run.py: run the training and plot the results】

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Tatanic.data_process import trainDataProcess,testDataProcess
from Tatanic.network import network


if __name__ == '__main__':

    # Training-set inputs and targets
    data_train, data_target = trainDataProcess('data/train.csv')

    # Test-set inputs and expected outputs
    data_test = testDataProcess('data/test.csv')
    test_label = pd.read_csv('data/gender_submission.csv')
    test_label = np.reshape(test_label.Survived.values.astype(np.float32),
                            newshape=(len(test_label),1))  # 418 x 1
    # Train
    loss_train, train_acc, test_acc = network(data_train=data_train,
                                              data_target=data_target,
                                              data_test=data_test,
                                              test_label=test_label)

    # Plot
    plt.figure(1)
    plt.plot(loss_train, 'k-', label='train_loss')
    plt.legend()
    plt.title('Titanic death prediction - loss')

    plt.figure(2)
    plt.plot(train_acc, 'b-', label='train_acc')
    plt.plot(test_acc, 'r--', label='test_acc')
    plt.title('Titanic death prediction - acc')
    plt.legend()
    plt.show()
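
run.py only evaluates against gender_submission.csv; to actually enter the Kaggle competition you also need a submission file with PassengerId and Survived columns. A hedged sketch, assuming network() were extended to also return the test-set predictions as a 0/1 array test_pred (it currently computes pred internally but does not return it):

import pandas as pd

# test_pred is hypothetical: a (418, 1) float array of 0/1 predictions
# that network() would need to return
passenger_id = pd.read_csv('data/test.csv')['PassengerId']
submission = pd.DataFrame({'PassengerId': passenger_id,
                           'Survived': test_pred.astype(int).ravel()})
submission.to_csv('submission.csv', index=False)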


【Results and analysis】

train_acc fluctuates a lot because my machine is underpowered and the batch size was set small (100): each logged training accuracy is computed on a single mini-batch, so it is a noisy estimate.
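
One cheap fix is to log the training accuracy over the whole training set instead of the last mini-batch, for example (a sketch; names as in network.py):

# inside the `if epoch % 1000 == 0:` block
train_acc_temp = sess.run(accuracy, feed_dict={x: data_train,
                                               y: data_target})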


Source: blog.csdn.net/muyao987/article/details/87902443