数据集如下图所示:
原始数据中的 Year 为 2000—2013,但由于 Year 与 Price 的数量级相差过大,梯度下降难以快速收敛,因此先将 Year 统一减去 2000(即用 0—13 表示 2000—2013 年),再进行拟合。
代码首先会显示两张动态图:一张展示梯度下降过程中代价函数的下降,另一张展示直线的拟合过程。
梯度下降收敛(拟合完毕)后,会再显示一张由闭式解法拟合出的直线图。
两种方法的图中都会标出对 2014 年房价的预测值。
python代码:
import csv
from numpy import *
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
def loadDataSet():
    """Load the training set from ``data.csv`` in the current directory.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, column 0 is read as the feature and column 1 as the
    target; both are converted with ``float``.

    Returns:
        tuple: ``(data, label)``. ``data`` is a list of ``[1.0, x]`` rows
        (the leading 1.0 is the bias/intercept term) and ``label`` is the
        list of target values.

    Raises:
        OSError: If ``data.csv`` cannot be opened.
        ValueError: If a cell cannot be converted to ``float``.
        IndexError: If a non-empty row has fewer than two columns.
    """
    data = []  # one feature per sample, plus the constant bias column
    label = []
    # 'utf-8-sig' strips a leading BOM if the file has one; newline='' is the
    # mode the csv module asks for. The with-block guarantees the handle is
    # closed, which the bare open() call did not.
    with open('data.csv', encoding='utf-8-sig', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # drop the header row (no-op on an empty file)
        for row in csv_reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            data.append([1.0, float(row[0])])
            label.append(float(row[1]))
    return data, label
def closeFormSolution(data, label):
    """Fit linear regression with the closed-form (normal equation) solution.

    Solves ``(X^T X) theta = X^T y`` where ``X`` is ``data`` and ``y`` is
    ``label``.

    Args:
        data: 2-D sequence of feature rows (one row per sample).
        label: 1-D sequence of target values, one per row of ``data``.

    Returns:
        numpy.matrix: Column vector ``theta`` with one row per column of
        ``data``.

    Raises:
        numpy.linalg.LinAlgError: If ``X^T X`` is singular.
    """
    # asmatrix rather than mat: the ``mat`` alias was removed from the NumPy
    # namespace in NumPy 2.0, while asmatrix exists in both 1.x and 2.x.
    dataMat = asmatrix(data)
    labelMat = asmatrix(label).transpose()
    gram = dataMat.transpose() * dataMat
    moment = dataMat.transpose() * labelMat
    # Solve the linear system directly instead of forming an explicit
    # inverse; the result is wrapped so callers still receive a matrix.
    theta = asmatrix(linalg.solve(gram, moment))
    return theta
def gradientDescent(data, label, alpha=0.0001, maxCycle=10000, epsilon=0.0002,
                    plot=True):
    """Fit linear regression by batch gradient descent on the squared error.

    The cost minimised is ``1/2 * (X theta - y)^T (X theta - y)``. Iteration
    stops after ``maxCycle`` updates, or earlier once the cost changes by
    less than ``epsilon`` between two consecutive updates.

    Args:
        data: 2-D sequence of feature rows (one row per sample).
        label: 1-D sequence of target values, one per row of ``data``.
        alpha: Learning rate (step size).
        maxCycle: Maximum number of update steps.
        epsilon: Convergence threshold on the absolute change in cost.
        plot: When true (the default), animate the cost curve in figure 1
            and the current fitted line in figure 2 after every update.
            Pass ``False`` to run the optimisation without any plotting.

    Returns:
        numpy.matrix: Column vector ``theta`` with one row per column of
        ``data``.
    """
    # asmatrix rather than mat: the ``mat`` alias was removed from the NumPy
    # namespace in NumPy 2.0, while asmatrix exists in both 1.x and 2.x.
    dataMat = asmatrix(data)
    labelMat = asmatrix(label).transpose()
    feature_count = dataMat.shape[1]
    # Start from all ones; kept as a matrix so the return type does not
    # depend on whether at least one update ran.
    theta = asmatrix(ones((feature_count, 1)))

    error = dataMat * theta - labelMat
    # Costs are held as plain floats so the convergence test compares
    # scalars rather than 1x1 matrices.
    precost = float((error.transpose() * error)[0, 0]) / 2

    # Endpoints of the most recent cost-curve segment: (iteration, cost).
    xs = [0, 0]
    ys = [0, precost]
    if plot:
        plt.ion()
        # Figure 1 is never cleared, so its labels only need setting once.
        plt.figure(1)
        plt.title('costFunction', fontsize=14)
        plt.xlabel('num of iterations', fontsize=8)
        plt.ylabel('cost', fontsize=8)

    for k in range(maxCycle):
        # error still holds the residual for the current theta, so this is
        # one step against the gradient X^T (X theta - y).
        theta = theta - alpha * (dataMat.transpose() * error)
        error = dataMat * theta - labelMat
        cost = float((error.transpose() * error)[0, 0]) / 2

        if plot:
            xs[0], ys[0] = xs[1], ys[1]
            # The initial cost sits at iteration 0, so the cost after the
            # (k+1)-th update belongs at x = k + 1.
            xs[1], ys[1] = k + 1, cost
            plt.figure(1)
            plt.plot(xs, ys, color='red')
            plt.figure(2)
            plotRegression(data, label, theta, 'gradientDescent')
            plt.pause(0.1)

        if abs(precost - cost) < epsilon:  # cost barely changed: converged
            break
        precost = cost
    return theta
def plotRegression(data, label, theta, title):
    """Draw the samples, the fitted line and the 2014 prediction.

    The current matplotlib figure is cleared and redrawn. The x axis uses
    year offsets (0 is labelled 2000, ..., 19 is labelled 2019), and the
    prediction is the fitted line evaluated at x = 14.

    Args:
        data: Sequence of ``[bias, x]`` rows; column 1 is plotted on the
            x axis.
        label: Sequence of target values, one per row of ``data``.
        theta: The two coefficients ``(intercept, slope)``, given as a 2x1
            matrix/array or any two-element sequence.
        title: Title shown above the plot.
    """
    plt.clf()
    # Flatten theta to two plain floats so the arithmetic below does not
    # depend on it being a numpy.matrix.
    intercept, slope = (float(value) for value in asarray(theta).ravel())
    x = arange(0, 20)
    y = intercept + slope * x
    prediction = float(y[14])  # fitted value at offset 14, i.e. year 2014

    ax = plt.subplot()
    plt.title(title, fontsize=14)
    plt.xlabel('year(+2000)', fontsize=8)
    plt.xticks(x, tuple(range(2000, 2020)))
    plt.ylabel('price', fontsize=8)
    ax.xaxis.set_major_locator(MultipleLocator(1))
    ax.yaxis.set_major_locator(MultipleLocator(2))
    plt.xlim(0, 16)
    plt.ylim(0, 16)
    # Plot every sample; the earlier hard-coded range(0, 14) failed for any
    # dataset that did not contain exactly 14 rows.
    plt.plot([row[1] for row in data], label, "ob")
    plt.plot(14, prediction, 'om')
    plt.text(14, prediction + 0.8, '2014_prediction', color='b', fontsize=10)
    plt.text(14, prediction, '%.2f' % prediction, color='b', fontsize=10)
    plt.plot(x, y, color='red')
def main():
    """Run the demo: fit by gradient descent, then by the closed form.

    Prints both coefficient vectors and draws the closed-form fit in
    figure 3 before handing control to ``plt.show()``.
    """
    features, targets = loadDataSet()

    descent_theta = gradientDescent(features, targets)
    print("梯度下降theta", descent_theta, sep="\n")

    # Leave interactive mode so the final plt.show() call blocks.
    plt.ioff()
    plt.figure(3)
    closed_theta = closeFormSolution(features, targets)
    plotRegression(features, targets, closed_theta, 'closeFormSolution')
    print("闭式解theta", closed_theta, sep="\n")

    plt.show()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()