#A Machine Learning Rookie's Comeback Road# Day 6

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt

path = 'ex2data2.txt'
data_init = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])
data_init.head()

positive2 = data_init[data_init['Accepted'].isin([1])]
negative2 = data_init[data_init['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
plt.show()

# Map the two raw scores to polynomial features up to degree 6
degree = 6
data2 = data_init
x1 = data2['Test 1']
x2 = data2['Test 2']
data2.insert(3, 'ones', 1)
for i in range(1, degree + 1):
    for j in range(0, i + 1):
        data2['F' + str(i - j) + str(j)] = np.power(x1, i - j) * np.power(x2, j)
data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)
data2.head()
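The cost function below calls sigmoid, which was defined back on day 5. A minimal sketch is repeated here so this section runs on its own:

# Logistic sigmoid from the earlier post, repeated for completeness
def sigmoid(z):
    return 1 / (1 + np.exp(-z))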

# Define the regularized cost function:
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:, 1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
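For reference, the quantity costReg computes is the regularized logistic cost
J(theta) = (1/m) * sum(-y*log(h) - (1-y)*log(1-h)) + (lambda/(2m)) * sum(theta_j^2) for j >= 1,
where h = sigmoid(X * theta.T). Note that theta0 is deliberately excluded from the penalty, which is why the code slices theta[:, 1:].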

# Define the gradient function (with regularization)
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            # theta0 carries no regularization penalty
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = np.sum(term) / len(X) + (learningRate / len(X)) * theta[0, i]
    return grad
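The loop version above is easy to follow. As a cross-check (not from the original post), an equivalent vectorized sketch computes the same regularized gradient in one expression:

# Vectorized alternative to gradientReg; a sketch that returns the same values
def gradientRegVec(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    error = sigmoid(X * theta.T) - y
    grad = ((X.T * error).T / len(X)) + (learningRate / len(X)) * theta
    grad[0, 0] -= (learningRate / len(X)) * theta[0, 0]  # theta0 carries no penalty
    return np.array(grad).ravel()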

# Initialize X, y, theta
cols = data2.shape[1]
X2 = data2.iloc[:, 1:cols]
y2 = data2.iloc[:, 0:1]
theta2 = np.zeros(cols - 1)
X2 = np.array(X2.values)
y2 = np.array(y2.values)
learningRate = 1
costReg(theta2, X2, y2, learningRate)  # compute the initial cost

# Now use a scipy optimizer to solve for the parameters
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
result2
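The accuracy check below reuses predict from day 5's post; a minimal sketch, assuming the usual 0.5 threshold:

# predict from the earlier post, repeated for completeness
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    return [1 if p >= 0.5 else 0 for p in probability]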

# Use the predict function from the previous post to check accuracy on the training set
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
accuracy = sum(map(int, correct)) / len(correct) * 100  # fraction of correct predictions, as a percentage
print('accuracy = {0:.0f}%'.format(accuracy))
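As an extra sanity check (not in the original post), scikit-learn can report per-class precision and recall for the same predictions, assuming sklearn is available:

from sklearn.metrics import classification_report
print(classification_report(y2, predictions))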

# Plot the decision boundary
def hfunc2(theta, x1, x2):
    temp = theta[0][0]
    place = 0
    for i in range(1, degree + 1):
        for j in range(0, i + 1):
            temp += np.power(x1, i - j) * np.power(x2, j) * theta[0][place + 1]
            place += 1
    return temp
# Note: hfunc2 is the decision function, i.e. the separating function:
# y = theta0 + theta1*x1 + theta2*x2 + theta3*x1^2 + theta4*x1*x2 + theta5*x2^2 + ...

# The boundary is found numerically: for each (x1, x2) on a grid, evaluate the decision
# function; points where its absolute value falls below a small threshold lie approximately
# on the boundary and will be drawn as yellow dots on the plot.
def find_decision_boundary(theta):
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)

    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    h_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])

    decision = h_val[np.abs(h_val['hval']) < 2 * 10**-3]
    return decision.x1, decision.x2

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')


x, y = find_decision_boundary(result2)
plt.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()

# With lambda = 0 (no regularization) the model overfits the training set
learningRate2 = 0
result3 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate2))

# With lambda = 100 the penalty dominates and the model underfits
learningRate2 = 100
result4 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate2))
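To actually see the overfitting and underfitting, the earlier plotting recipe can be reused with result3 or result4 (a sketch; swap in result4 for the lambda = 100 case):

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
x, y = find_decision_boundary(result3)  # lambda = 0: a very wiggly boundary
plt.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()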



Reposted from blog.csdn.net/ballzy/article/details/104430708