数据挖掘day16、17-CS229-WEEK3 Logistic Regression

在这里插入图片描述
今天参考github的项目进行分类实现,使用的是课程课后练习数据
使用的是随机梯度下降,学习率 $\alpha$ 的选择和遍历次数的设置还是复杂一点。原文章用的数据分类清晰,效果比较好;课后习题的数据分类比较模糊。如果输出的不是'0'、'1',而是 $h_{\theta}(x)$,要达到较好的效果(收敛)需要遍历350次;如果只是画边界线的话,30次就不错了。
下图是遍历30次的图像,明显没有收敛。
30次
但是边界线的效果已经可以了(边界线函数:$y=\frac{\ln 1-\theta_0-\theta_1 x}{\theta_2}$):
在这里插入图片描述

from sklearn.datasets.samples_generator import make_blobs
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from math import exp, log 
import math

def Nomalize_data(data):
    """Standardise every column of a 2-D array to zero mean and unit variance.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).

    Returns:
        A float array of the same shape whose columns are z-scored.  An
        input that is already a float64 ndarray is normalised in place and
        returned; any other input (e.g. an integer array or a nested list)
        is first converted to a new float array, so the z-scores are not
        truncated back to the original dtype.

    A column with zero standard deviation is only centred (it becomes all
    zeros) instead of being divided by zero.
    """
    # asarray does not copy an existing float64 ndarray, so callers relying
    # on the in-place update keep working; other dtypes get a float copy.
    data = np.asarray(data, dtype=float)
    for i in range(data.shape[1]):
        column = data[:, i]
        centered = column - column.mean()
        spread = column.std()
        # Guard against constant features: dividing by 0 would give NaN.
        data[:, i] = centered / spread if spread > 0 else centered
    return data
def sigmoid_function(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of a scalar ``z``.

    The value is computed in a numerically stable way: ``exp`` is only ever
    called with a non-positive argument, so very negative ``z`` no longer
    raises ``OverflowError`` (the result simply underflows towards 0.0).

    Args:
        z: A real scalar.

    Returns:
        A float in the closed interval [0, 1].
    """
    if z >= 0:
        return 1 / (1 + exp(-z))
    # For negative z use the algebraically equivalent e^z / (1 + e^z);
    # exp(z) is at most 1 here and therefore cannot overflow.
    e_z = exp(z)
    return e_z / (1 + e_z)
def hypothesis(x, THETA):
    """Evaluate the logistic model h_theta(x) for one sample.

    Args:
        x: Feature vector of a single sample.
        THETA: Parameter array; ``THETA.T`` is matrix-multiplied with ``x``.

    Returns:
        The sigmoid of the first element of ``THETA.T @ x``.
    """
    linear_response = THETA.T @ x
    # The product is array-valued; its first element is the scalar score.
    return sigmoid_function(linear_response[0])
def compute_loss(X, Y, THETA):
    """Return the mean binary cross-entropy of the model over a dataset.

    For every sample the contribution is
    ``-(y * log(h) + (1 - y) * log(1 - h))`` with ``h = hypothesis(x, THETA)``.

    Args:
        X: 2-D array of samples, one row per sample.
        Y: Iterable of labels (0 or 1), aligned with the rows of ``X``.
        THETA: Model parameters passed through to ``hypothesis``.

    Returns:
        The summed per-sample loss divided by ``X.shape[0]``.
    """
    # Keep predictions strictly inside (0, 1): log(0) would raise at
    # either end of the range, not only when h_x == 1.
    eps = 0.0000000000001
    loss = 0
    for x, y in zip(X, Y):
        h_x = hypothesis(x, THETA)
        h_x = min(max(h_x, eps), 1 - eps)
        # Both terms must be negated; the (1 - y) term applies to the
        # y == 0 samples and must not be multiplied by y.
        loss += -(y * log(h_x) + (1 - y) * log(1 - h_x))
    return loss / X.shape[0]
def update_parameters(THETA, LR, y, h_x, x):
    """Apply one stochastic-gradient step for a single sample.

    Args:
        THETA: Current parameter array.
        LR: Learning rate (step size).
        y: Label of the sample.
        h_x: Model prediction for the sample.
        x: Feature vector of the sample; reshaped to ``THETA.shape``.

    Returns:
        A new parameter array ``THETA + LR * (y - h_x) * x``; the input
        ``THETA`` is not modified.
    """
    features = np.reshape(x, THETA.shape)
    step = LR * (y - h_x) * features
    return THETA + step
if __name__ == '__main__':
    # --- Hyper-parameters -------------------------------------------------
    LR = 0.003   # learning rate for each per-sample update
    EPOCH = 30   # number of full passes over the data set

    # --- Load and prepare the data ----------------------------------------
    # The first two columns of the file are the features, the third is the label.
    data = np.loadtxt('ex2data1.txt', delimiter=',')
    X = data[:, :2]
    Y = data[:, 2]
    X_train = Nomalize_data(X)
    Y_train = Y

    # Prepend a column of ones so THETA[0] acts as the intercept term.
    bias_column = np.ones([X_train.shape[0], 1], dtype=X_train.dtype)
    X_train = np.concatenate((bias_column, X_train), axis=1)

    # Latest prediction h_theta(x) recorded for every sample.
    H_train = np.zeros([Y_train.shape[0], 1], dtype=Y_train.dtype)
    THETA = np.random.normal(0, 0.1, 3).reshape(3, 1)  # learnable parameters

    # --- Training loop; the cost J is recorded before every update --------
    plt.figure(0)
    cost = []
    for epoch in range(EPOCH):
        for i, (x, y) in enumerate(zip(X_train, Y_train)):
            cost.append(compute_loss(X_train, Y_train, THETA))
            H_train[i] = hypothesis(x, THETA)
            THETA = update_parameters(THETA, LR, y, H_train[i], x)
    print(THETA)
    plt.scatter(range(len(cost)), cost)
    plt.show()

    # --- Decision boundary -------------------------------------------------
    # The boundary is where THETA[0] + THETA[1]*x + THETA[2]*y equals
    # log(1) == 0, solved here for y.
    plt.figure(1)
    x = np.linspace(-2, 1.5, 50)
    y = (log(1, math.e) - THETA[0] - THETA[1] * x) / THETA[2]
    plt.plot(x, y)
    plt.scatter(
        X_train[:, 1],
        X_train[:, 2],
        c=Y_train,
        edgecolors='white',
        marker='s',
    )
    print(X_train[50:60, 1], X_train[50:60, 2], H_train[50:60, 0])
    plt.show()
发布了90 篇原创文章 · 获赞 3 · 访问量 4928

猜你喜欢

转载自blog.csdn.net/weixin_43329319/article/details/98092843