Part 2: Logistic Regression + Regularization

Build a logistic regression model to predict whether a student is admitted to a university. Each applicant's chance of admission is determined by the scores of two exams, and historical data from previous applicants is available to use as the training set for logistic regression.

Implementing logistic regression in Python. Goal: build a classifier, i.e. solve for the three parameters θ0, θ1, θ2, which gives the decision boundary. Note: θ1 corresponds to the Exam1 score and θ2 to the Exam2 score. Then set a threshold and judge the admission result against it. Note: the threshold applies to the final probability value, which is converted into a class label; conventionally a probability above 0.5 is classified as admitted and below 0.5 as not admitted.
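
For reference (standard notation for this exercise, not spelled out in the original post), the hypothesis and decision rule are:

$$h_\theta(x) = g(\theta_0 + \theta_1 x_1 + \theta_2 x_2), \qquad g(z) = \frac{1}{1+e^{-z}}, \qquad \hat{y} = 1 \iff h_\theta(x) \ge 0.5$$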

1: __init__.py (main logistic regression script)

# Main logistic regression script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # scatter plots
plt.style.use('fivethirtyeight')  # style the plots
from sklearn.metrics import classification_report  # evaluation report
import sigmoid as sg  # sigmoid function
import Cost_function as cf  # cost function
import gradient_descent as gd  # gradient function
import scipy.optimize as opt  # advanced optimization algorithms live here

data = pd.read_csv(r'******', names = ['exam1', 'exam2', 'admitted'])
print(data.head())
# Draw the scatter plot
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2), color_codes=False)  # style parameters; the default darkgrid theme is a grey background with a white grid
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,  # fit_reg controls whether a fitted regression line is drawn
           scatter_kws={"s": 50}
          )  # hue overlays the categories of the named column in one figure
#plt.show()
def get_X(df):  # read the feature matrix
    ones = pd.DataFrame({'Ones': np.ones(len(df))})
    data = pd.concat([ones, df], axis=1)  # these two lines prepend a column X0 = 1
    return data.iloc[:, :-1].values  # returns an ndarray, not a matrix; as_matrix() is deprecated
def get_y(df):  # read the labels
    return np.array(df.iloc[:, -1])
def normalize_feature(df):
    return df.apply(lambda column: (column - column.mean()) / column.std())  # feature normalization (feature scaling)

# Set up X, y, theta
X = get_X(data)
y = get_y(data)
theta = np.zeros(3)  # np.zeros(k) creates a 1-D ndarray of length k; here theta0, theta1, theta2
# print(X.shape)
# print(y.shape)
print(cf.cost(theta, X, y))
print(gd.gradient(theta, X, y))  # gradient of the cost at the initial theta

# Use the advanced optimization routine scipy.optimize.minimize to find theta
res = opt.minimize(fun=cf.cost, x0=theta, args=(X, y), method='Newton-CG', jac=gd.gradient)
print(res)
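# (Addition, not from the original post) a hand-rolled batch gradient descent
# would also converge, just far more slowly than Newton-CG; alpha and n_iters
# here are illustrative guesses, and the raw exam scores (0-100 scale) would
# benefit from normalize_feature() first:
# theta_gd = np.zeros(3)
# alpha, n_iters = 0.001, 500000
# for _ in range(n_iters):
#     theta_gd -= alpha * gd.gradient(theta_gd, X, y)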

# Predict and validate on the training set
def predict(x, theta):
    y_pred = sg.sigmoid(x.dot(theta))  # use the parameter x, not the global X
    return (y_pred >= 0.5).astype(int)  # convert probabilities into class labels
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
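# (Addition) one-line overall accuracy as a quick sanity check:
# print('accuracy = {:.2%}'.format((y_pred == y).mean()))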

# Find the decision boundary
# The boundary is the line theta^T x = 0, i.e. theta0 + theta1*x1 + theta2*x2 = 0.
# theta2 corresponds to the exam2 score; for plotting, x2 is renamed y, and dividing
# theta by theta2 normalizes the coefficients so that y = coef[0] + coef[1] * x.
coef = -(res.x / res.x[2])
x = np.arange(130, step=0.1)
y = coef[0] + coef[1] * x
# Plot the scatter plot together with the decision boundary
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
          )
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()

2: Regularized.py (main regularization script)

# Main regularization script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # scatter plots
plt.style.use('fivethirtyeight')  # style the plots
from sklearn.metrics import classification_report  # evaluation report
import feature_mapping as fm
import regularized_function as rf
import sigmoid as sg  # sigmoid function
import scipy.optimize as opt  # fit the parameters with an advanced optimizer

def predict(x, theta):
    y_pred = sg.sigmoid(x.dot(theta))  # use the parameter x, not the global X
    return (y_pred >= 0.5).astype(int)  # convert probabilities into class labels

df = pd.read_csv(r'******', names = ['test1', 'test2', 'accepted'])
#print(df.head())
# Draw the scatter plot
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 50}
          )
plt.title('Regularized Logistic Regression')
#plt.show()

x1 = np.array(df.test1)
x2 = np.array(df.test2)

data2 = fm.feature_mapping(x1, x2, power = 6)
theta = np.zeros(data2.shape[1])
X = fm.feature_mapping(x1, x2, power = 6, as_ndarray=True)
y = np.array(df.iloc[:, -1])
# print(X.shape)
# print(y.shape)

print(rf.regularized_cost(theta, X, y, l=1))  # regularized cost
print(rf.regularized_gradient(theta, X, y))  # regularized gradient

# Fit the parameters
print('init cost = {}'.format(rf.regularized_cost(theta, X, y)))

res = opt.minimize(fun=rf.regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=rf.regularized_gradient)
print(res)
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))  # prediction report

# Draw the decision boundary for different values of lambda
def draw_boundary(power, l):  # visualize the fitted boundary
    density = 1000
    threshold = 2 * 10 ** -3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)

    df = pd.read_csv(r'******', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})

    plt.scatter(x, y, c='r', s=10)  # matplotlib single-letter colors are lowercase
    plt.title('Decision boundary')
    plt.show()

def feature_mapped_logistic_regression(power, l):
    df = pd.read_csv(r'******', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = np.array(df.iloc[:, -1])

    X = fm.feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])

    res = opt.minimize(fun=rf.regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=rf.regularized_gradient)
    final_theta = res.x

    return final_theta

# Find points lying on the decision boundary
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)

    coordinates = [(x, y) for x in t1 for y in t2]
    x_coord, y_coord = zip(*coordinates)
    mapped_coord = fm.feature_mapping(x_coord, y_coord, power)  # this is a DataFrame

    inner_product = mapped_coord.values @ theta

    # keep the grid points where theta^T x is numerically close to zero
    decision = mapped_coord[np.abs(inner_product) < threshold]

    return decision.f10, decision.f01  # f10 = x1 and f01 = x2, the raw coordinates

draw_boundary(power=6, l=0)  # lambda = 0, i.e. no regularization
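
As an aside, a common alternative to the threshold search above is to evaluate θᵀ·feature_mapping(x1, x2) on a meshgrid and let matplotlib trace the zero contour. A minimal sketch, reusing fm.feature_mapping and a fitted final_theta as defined above (not code from the original post):

def draw_boundary_contour(theta, power, density=200):
    # evaluate theta^T x over a grid and draw the zero level set directly
    t = np.linspace(-1, 1.5, density)
    xx, yy = np.meshgrid(t, t)
    z = fm.feature_mapping(xx.ravel(), yy.ravel(), power).values @ theta
    plt.contour(xx, yy, z.reshape(xx.shape), levels=[0], colors='grey')
    plt.show()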


3: sigmoid.py (sigmoid function)

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(z):
    gz = 1 / (1 + np.exp(-z))  # g(z) = 1 / (1 + e^(-z)), maps R into (0, 1)
    return gz

# Plot the sigmoid function
# fig, ax = plt.subplots(figsize=(8, 6))
# ax.plot(np.arange(-10, 10, step=0.01),
#         sigmoid(np.arange(-10, 10, step=0.01)))
# ax.set_ylim((-0.1,1.1))
# ax.set_xlabel('z', fontsize=18)
# ax.set_ylabel('g(z)', fontsize=18)
# ax.set_title('sigmoid function', fontsize=18)
# plt.show()

4: Cost_function.py (cost function)

# Cost function
import numpy as np
import sigmoid as sg  # sigmoid function

def cost(theta, X, y):
    h = sg.sigmoid(X.dot(theta))  # hypothesis h_theta for every sample
    costf = np.sum((-y * np.log(h)) - (1 - y) * np.log(1 - h)) / len(X)
    return costf
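
For reference, the code above computes the standard (unregularized) logistic regression cost, with m = len(X) and h = g(Xθ):

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log h_\theta(x^{(i)}) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big]$$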

5: gradient_descent.py (gradient function)

# Gradient of the cost function (the descent step itself is delegated to scipy.optimize)
import sigmoid as sg

def gradient(theta, X, y):
    h = sg.sigmoid(X.dot(theta))
    grad = X.T.dot(h - y) / len(X)  # vectorized: X^T (h - y) / m
    return grad
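
This is the vectorized gradient of the cost above, stated here for reference:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)x_j^{(i)}, \qquad \nabla J(\theta) = \frac{1}{m}X^{T}\big(g(X\theta) - y\big)$$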

6: feature_mapping.py (feature mapping)

# Feature mapping
# When the classification problem is complex and the raw features are only x1 and x2,
# polynomials can create more features: x1, x2, x1*x2, x1^2, x2^2, ..., x1^n * x2^n.
# With more features, logistic regression can fit a separating curve of arbitrarily
# high order.
import numpy as np
import pandas as pd

def feature_mapping(x, y, power, as_ndarray=False):
    """Return the mapped polynomial features as an ndarray or a DataFrame."""

    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)
            }

    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
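
A quick sanity check of the column layout (added here for illustration, not in the original post): for power = 2 the mapping produces six terms, and for power = 6 it produces 28 columns.

# manual test, e.g. appended at the bottom of feature_mapping.py
demo = feature_mapping(np.array([1.0, 2.0]), np.array([3.0, 4.0]), power=2)
print(list(demo.columns))  # ['f00', 'f10', 'f01', 'f20', 'f11', 'f02']
print(demo.shape)          # (2, 6)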

7: regularized_function.py (regularization helpers)

# Regularization helper functions
import numpy as np
import Cost_function as cf
import gradient_descent as gd

# Regularized cost function
def regularized_cost(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]  # theta0 is not regularized
    regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()  # note the parentheses around (2 * len(X))
    regu_cost = cf.cost(theta, X, y) + regularized_term
    return regu_cost

# Regularized gradient
def regularized_gradient(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]
    regularized_theta = (l / len(X)) * theta_j1_to_n
    # regularization starts at theta1, so prepend a 0 to match the parameter count
    regularized_term = np.concatenate([np.array([0]), regularized_theta])
    return gd.gradient(theta, X, y) + regularized_term
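
For reference, these two functions implement the penalized cost and its gradient; θ0 is deliberately excluded from the penalty in both:

$$J_{reg}(\theta) = J(\theta) + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^{2}, \qquad \frac{\partial J_{reg}}{\partial \theta_j} = \frac{\partial J}{\partial \theta_j} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)$$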
