# 分析加州大学学生录取的数据 (Analysis of UC student-admissions data)

# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 13:34:04 2020

@author: 陨星落云
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the admissions dataset; the columns used below are
# 'admit' (0/1 label), 'gre', 'gpa' and 'rank'.
data = pd.read_csv('student_data.csv')

# print(data.head(10))

# Scatter-plot the applicants in (GRE, GPA) space.
def plot_points(data):
    """Plot admitted students (cyan) vs. rejected students (red).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'gre', 'gpa' and a binary 'admit' column.
    """
    X = np.array(data[['gre', 'gpa']])
    y = np.array(data['admit'])
    # Boolean masks select the rows directly; the original used
    # np.argwhere, which yields (k, 1) index arrays and forced the
    # awkward s[0][0]/s[0][1] unwrapping in the scatter calls.
    admitted = X[y == 1]
    rejected = X[y == 0]
    plt.scatter(rejected[:, 0], rejected[:, 1], s=25, color='red', edgecolor='k')
    plt.scatter(admitted[:, 0], admitted[:, 1], s=25, color='cyan', edgecolor='k')
    plt.xlabel('Test(GRE)')
    plt.ylabel('Grades(GPA)')

# plot_points(data)
# plt.show()

# Partition the applicants by undergraduate school rank (1..4).
data_rank1, data_rank2, data_rank3, data_rank4 = (
    data[data['rank'] == r] for r in (1, 2, 3, 4)
)
# print(data_rank1)

# # 绘图
# plt.figure(figsize=(12,12),dpi=80)
# plt.subplot(221)
# plot_points(data_rank1)
# plt.title('Rank1')
# plt.subplot(222)
# plot_points(data_rank2)
# plt.title('Rank2')
# plt.subplot(223)
# plot_points(data_rank3)
# plt.title('Rank3')
# plt.subplot(224)
# plot_points(data_rank4)
# plt.title('Rank4')
# plt.show()

# One-hot encode the school rank so the network does not treat it as an
# ordinal number.
# NOTE(review): the raw 'rank' column is kept alongside the dummy columns,
# so it still feeds the network as a numeric feature — confirm whether
# that is intended before dropping it.
one_hot_data = pd.concat([data, pd.get_dummies(data['rank'], prefix='rank')], axis=1)

# Scale the inputs into [0, 1].
# Use an explicit .copy(): the original `one_hot_data[:]` may share data
# with the source frame, and the column assignments below can then raise
# pandas' SettingWithCopyWarning / write through to the original.
processed_data = one_hot_data.copy()
processed_data['gre'] = processed_data['gre'] / 800   # max GRE score
processed_data['gpa'] = processed_data['gpa'] / 4.0   # max GPA
# print(processed_data[:10])

# Split into a training set (60%) and a test set (40%).
# NOTE(review): the original comment claimed a 90/10 split, but the code
# samples 60% for training — the comment is corrected to match the code.
sample = np.random.choice(processed_data.index, size=int(len(processed_data) * 0.6), replace=False)
# `sample` holds index *labels* (drawn from processed_data.index), so use
# label-based .loc; positional .iloc only works here by accident of the
# default RangeIndex.
train_data, test_data = processed_data.loc[sample], processed_data.drop(sample)
# print(train_data[:10])
# print(test_data[:10])

# Separate the inputs (features) from the label being predicted ('admit').
features = train_data.drop(columns='admit')
targets = train_data['admit']
features_test = test_data.drop(columns='admit')
targets_test = test_data['admit']

# print(features[:10])
# print(targets[:10])

# ---- Neural-network math helpers ----

def sigmoid(x):
    """Logistic activation: maps any real x into (0, 1)."""
    return 1 / (1 + np.exp(-x))


def sigmoid_prime(x):
    """Derivative of the sigmoid: s(x) * (1 - s(x))."""
    s = sigmoid(x)  # compute the sigmoid once instead of twice
    return s * (1 - s)


def error_formula(y, output):
    """Binary cross-entropy loss for label y and prediction `output`."""
    return -y * np.log(output) - (1 - y) * np.log(1 - output)


def error_term_formula(y, output):
    """Backprop error term; for sigmoid + cross-entropy it reduces to y - output."""
    return y - output

epochs = 1000     # number of full passes over the training set
learnrate = 0.05  # gradient-descent step size

def train_nn(features, targets, epochs, learnrate):
    """Train a single-layer sigmoid network with batch gradient descent.

    Parameters
    ----------
    features : pandas.DataFrame
        Scaled input columns, one row per student.
    targets : array-like of 0/1
        Admission labels aligned with `features`.
    epochs : int
        Number of passes over the training set.
    learnrate : float
        Gradient-descent step size.

    Returns
    -------
    numpy.ndarray
        The learned weight vector (one weight per feature column).
    """
    # Fixed seed so the weight initialization is reproducible.
    np.random.seed(42)

    n_records, n_features = features.shape
    last_loss = None

    # Initialize weights from N(0, 1/sqrt(n_features)).
    weights = np.random.normal(scale=1 / n_features ** .5, size=n_features)

    for e in range(epochs):
        del_w = np.zeros(weights.shape)
        for x, y in zip(features.values, targets):
            output = sigmoid(np.dot(x, weights))
            # Error term for sigmoid + cross-entropy is simply (y - output).
            # (The original also computed error_formula(y, output) here and
            # discarded the result — removed as dead work.)
            error_term = error_term_formula(y, output)
            del_w += error_term * x
        # Batch update: average the accumulated gradient over all records.
        weights += learnrate * del_w / n_records

        # Report the training loss ten times over the run.
        if e % (epochs // 10) == 0:
            out = sigmoid(np.dot(features, weights))
            loss = np.mean((out - targets) ** 2)
            print('Epoch:', e)
            if last_loss and last_loss < loss:
                # Typos fixed from the original message ('Waring-Loss Incraesing').
                print('Train loss:', loss, 'Warning - Loss Increasing')
            else:
                print('Train loss:', loss)
            last_loss = loss
            print('========')
    print('Finished training!')
    return weights

# Fit the network on the training split.
weights = train_nn(features, targets, epochs, learnrate)

# Evaluate on the held-out test data: predict "admit" whenever the
# network's output probability exceeds 0.5.
test_out = sigmoid(np.dot(features_test, weights))
predictions = test_out > 0.5
accuracy = (predictions == targets_test).mean()
print('Prediction accuracy:{:.3f}'.format(accuracy))

# Sample output from one run:
#
# Epoch: 0
# Train loss: 0.34068382193501623
# ========
# Epoch: 100
# Train loss: 0.201014332510653
# ========
# Epoch: 200
# Train loss: 0.1995071314375893
# ========
# Epoch: 300
# Train loss: 0.1987857819072691
# ========
# Epoch: 400
# Train loss: 0.19840143813645875
# ========
# Epoch: 500
# Train loss: 0.19816387691097853
# ========
# Epoch: 600
# Train loss: 0.1979924941483765
# ========
# Epoch: 700
# Train loss: 0.19785272862525882
# ========
# Epoch: 800
# Train loss: 0.19772944768387532
# ========
# Epoch: 900
# Train loss: 0.19761581319040344
# ========
# Finished training!
# Prediction accuracy:0.681

# Source: blog.csdn.net/qq_28368377/article/details/105120969