Keras (18): Using pre-defined Estimators and cross features in practice

This article will introduce:

  • Pre-defined estimator.LinearClassifier model
  • Pre-defined estimator.DNNClassifier model
  • Cross features in practice

Part 1. Load the Titanic data set

1. Download the Titanic data set, then read and parse it with pandas

The data can be downloaded from the following URLs:

https://storage.googleapis.com/tf-datasets/titanic/train.csv
https://storage.googleapis.com/tf-datasets/titanic/eval.csv

Read the downloaded data with pandas (the code expects the files under ./data/titanic/); the code is as follows:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

# Paths to the Titanic training and evaluation CSV files
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Parse each CSV into its own DataFrame (training set first, then evaluation set)
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of both frames
for frame in (train_df, eval_df):
    print(frame.head())

2. Separate the features from the target values

# pop() removes the 'survived' column from each frame in place and returns it,
# leaving only the feature columns behind in train_df / eval_df.
y_train, y_eval = (frame.pop('survived') for frame in (train_df, eval_df))

# Preview the remaining features first, then the extracted labels
for table in (train_df, eval_df, y_train, y_eval):
    print(table.head())
3. Use pandas to compute summary statistics for the numeric fields
# describe() only returns the statistics table; print it so the result is
# visible when this code runs as a script rather than in a notebook cell.
print(train_df.describe())

Part 2. Process the data with feature_column and convert it to a tf.data.Dataset

1. Build the feature columns: one-hot encode the "discrete features" and keep the "continuous features" as numeric columns
# Split the features into two lists: "discrete features" and "continuous features"
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# Process the "discrete features" with tf.feature_column
for categorical_column in categorical_columns:
    # The vocabulary is the set of distinct values found in the training data
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    # Wrap the vocabulary column in an indicator column
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Process the "continuous features" with tf.feature_column
# (loop variable named numeric_column so it is not confused with the
# categorical loop above)
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))
2. Convert the pandas DataFrame data into a batched tf.data.Dataset (BatchDataset)
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    """Build a batched tf.data dataset from a feature frame and its labels.

    Args:
        data_df: Feature table; converted with dict() so each column becomes
            one named entry of the features dictionary.
        label_df: Labels, sliced in step with the features.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements.
        batch_size: Number of elements per batch.

    Returns:
        The dataset after optional shuffling, repeating and batching.
    """
    features = dict(data_df)
    ds = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.repeat(epochs)
    return ds.batch(batch_size)

Part 3. LinearClassifier

# Create the pre-defined LinearClassifier estimator
linear_output_dir = 'linear_model'
# exist_ok=True replaces the check-then-create pair (os.path.exists + os.mkdir),
# which can fail if the directory appears between the check and the mkdir.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# Train the pre-defined LinearClassifier estimator
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate the pre-defined LinearClassifier estimator; keep and print the
# returned result so it is not silently discarded when run as a script.
linear_eval_result = linear_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
print(linear_eval_result)

Part 4. DNNClassifier

# Create the pre-defined DNNClassifier estimator
dnn_output_dir = './dnn_model'
# exist_ok=True replaces the check-then-create pair (os.path.exists + os.mkdir),
# which can fail if the directory appears between the check and the mkdir.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    optimizer = 'Adam')
# Train the pre-defined DNNClassifier estimator
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

# Evaluate the pre-defined DNNClassifier estimator; keep and print the
# returned result so it is not silently discarded when run as a script.
dnn_eval_result = dnn_estimator.evaluate(input_fn = lambda : make_dataset(
    eval_df, y_eval, epochs = 1, shuffle = False))
print(dnn_eval_result)

Part 5. Cross features in practice

1. Implementing a cross feature in code
# Build a <cross feature> from several features and wrap it in an indicator column.
# cross feature: age: [1,2,3,4,5], gender:[male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# Bucketing example: 100000 possible values -> 100 buckets via hash(value) % 100
age_x_sex = tf.feature_column.crossed_column(
    ['age', 'sex'], hash_bucket_size = 100)
feature_columns.append(tf.feature_column.indicator_column(age_x_sex))
2. The complete code is as follows
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 14:03:56 2020

@author: nijiahui
"""
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Print the versions of the Python libraries in use
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

### Part 1: load the Titanic data set ##########################################
# 1. Download the Titanic data set, then read and parse it with pandas
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())
print(eval_df.head())

# 2. Separate the features from the target values
# pop() removes the 'survived' column in place and returns it as the labels.
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

# 3. Use pandas to compute summary statistics for the numeric fields
# describe() only returns the table; print it so it is visible in a script run.
print(train_df.describe())

### Part 2: process the data with feature_column and convert it to a tf.data dataset ###
# 1. Build feature columns from the "discrete features" and "continuous features"
# Split the features into two lists: "discrete features" and "continuous features"
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class','deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
# Discrete features: vocabulary column wrapped in an indicator column
for name in categorical_columns:
    # The vocabulary is the set of distinct values found in the training data
    vocab = train_df[name].unique()
    print(name, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        name, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Continuous features: plain float32 numeric columns
feature_columns.extend(
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numeric_columns)

# Build a <cross feature> from several features and wrap it in an indicator column.
# cross feature: age: [1,2,3,4,5], gender:[male, female]
# age_x_gender: [(1, male), (2, male), ..., (5, male), ..., (5, female)]
# Bucketing example: 100000 possible values -> 100 buckets via hash(value) % 100
age_x_sex = tf.feature_column.crossed_column(
    ['age', 'sex'], hash_bucket_size = 100)
feature_columns.append(tf.feature_column.indicator_column(age_x_sex))

# 2. Convert the pandas data into a batched tf.data dataset
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,batch_size = 32):
    """Build a batched tf.data dataset from a feature frame and its labels.

    Args:
        data_df: Feature table; converted with dict() so each column becomes
            one named entry of the features dictionary.
        label_df: Labels, sliced in step with the features.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements.
        batch_size: Number of elements per batch.

    Returns:
        The dataset after optional shuffling, repeating and batching.
    """
    slices = (dict(data_df), label_df)
    pipeline = tf.data.Dataset.from_tensor_slices(slices)
    if shuffle:
        pipeline = pipeline.shuffle(10000)
    repeated = pipeline.repeat(epochs)
    return repeated.batch(batch_size)

### Part 3: LinearClassifier
# Create the pre-defined LinearClassifier estimator
linear_output_dir = 'linear_model_new_features'
# exist_ok=True replaces the check-then-create pair (os.path.exists + os.mkdir),
# which can fail if the directory appears between the check and the mkdir.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)
# Train the pre-defined LinearClassifier estimator
linear_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# Evaluate the pre-defined LinearClassifier estimator; keep and print the
# returned result so it is not silently discarded when run as a script.
linear_eval_result = linear_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))
print(linear_eval_result)


### Part 4: DNNClassifier
# Create the pre-defined DNNClassifier estimator
dnn_output_dir = './dnn_model_new_features'
# exist_ok=True replaces the check-then-create pair (os.path.exists + os.mkdir),
# which can fail if the directory appears between the check and the mkdir.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    hidden_units = [128, 128],
    activation_fn = tf.nn.relu,
    # alternative: optimizer = 'Adam'
    optimizer = 'SGD'
    )
# Train the pre-defined DNNClassifier estimator
dnn_estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))

# Evaluate the pre-defined DNNClassifier estimator; keep and print the
# returned result so it is not silently discarded when run as a script.
dnn_eval_result = dnn_estimator.evaluate(input_fn = lambda : make_dataset(eval_df, y_eval, epochs = 1, shuffle = False))
print(dnn_eval_result)

Guess you like

Origin blog.csdn.net/TFATS/article/details/111694596