统计学习方法_感知机实现

数据集为二值化的MNIST，下载地址：MNIST

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time

import cv2
import numpy as np
import pandas as pd

try:
    # Current location of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split


# 提取hog特征，784 -> 324
def get_hog_features(trainset):
    """Compute a HOG descriptor for every flattened 28x28 image.

    Args:
        trainset: Iterable of arrays that can each be reshaped to (28, 28).

    Returns:
        A 2-D ndarray with one row per image and 324 columns.
    """
    # The descriptor parameters are loaded from the configuration file.
    descriptor = cv2.HOGDescriptor('./hog.xml')

    # Each image is cast to uint8 (0-255, the pixel value range) before the
    # descriptor is computed.
    per_image = [
        descriptor.compute(flat.reshape(28, 28).astype(np.uint8))
        for flat in trainset
    ]

    # First dimension is inferred; the second is the 324-value feature vector.
    return np.array(per_image).reshape(-1, 324)

def Train(trainset, train_labels):
    """Train a binary perceptron using randomly drawn single-sample updates.

    On every iteration one sample is drawn uniformly at random. If it is
    misclassified (``yi * (w . xi + b) <= 0``) the parameters are moved
    toward it by ``learning_rate``. Training stops when either 100000
    consecutive draws needed no update, or more than ``study_total``
    updates have been performed.

    Reads the module-level settings ``feature_length``, ``learning_rate``,
    ``object_num`` and ``study_total``.

    Args:
        trainset: Indexable collection of feature vectors; each vector must
            hold ``feature_length`` values and support ``dot``/``reshape``.
        train_labels: Labels aligned with ``trainset``. A label equal to
            ``object_num`` is the negative class (-1); any other label is
            the positive class (+1).

    Returns:
        Tuple ``(w, b)`` where ``w`` is an ndarray of shape
        ``(feature_length, 1)`` and ``b`` is the scalar bias.

    Raises:
        ValueError: If ``train_labels`` is empty.
    """
    trainset_size = len(train_labels)
    if trainset_size == 0:
        # Fail early with a clear message instead of the opaque
        # "low >= high" error np.random.randint(0, 0) would raise.
        raise ValueError('Train requires at least one training sample')

    # Initialise the weights and the bias.
    w = np.zeros((feature_length, 1))  # shape (D, 1)
    b = 0

    # Number of updates performed; grows only on a misclassified sample.
    study_count = 0
    # Consecutive draws that needed no update; reset to 0 on every update.
    nochange_count = 0
    # Once this many consecutive draws pass, the model counts as trained.
    nochange_upper_limit = 100000

    while True:
        nochange_count += 1
        if nochange_count > nochange_upper_limit:
            break

        # Pick one random sample (upper bound is exclusive).
        index = np.random.randint(0, trainset_size)
        image = trainset[index]  # shape (D,)
        label = train_labels[index]

        # Map the label to {-1, +1} and compute yi * (w . xi + b).
        yi = int(label != object_num) * 2 - 1
        result = yi * (image.dot(w) + b)

        # A non-positive value means the sample is misclassified.
        if result <= 0:
            # Reshape to a column vector so it matches w's (D, 1) shape.
            w += learning_rate * yi * image.reshape(feature_length, 1)
            b += learning_rate * yi

            study_count += 1
            # Cap the updates with the configured maximum, study_total,
            # not with the unrelated consecutive-success limit.
            if study_count > study_total:
                break
            nochange_count = 0

    return w, b

def Predict(test_set, w, b):
    """Classify every sample by the sign of ``sample.dot(w) + b``.

    Args:
        test_set: Iterable of feature vectors supporting ``dot``.
        w: Weight array; with shape (D, 1) each decision has shape (1,).
        b: Scalar bias added to every score.

    Returns:
        ndarray of the per-sample decisions: True where the score is
        strictly greater than 0, False otherwise (a score of 0 is False).
    """
    decisions = [(sample.dot(w) + b) > 0 for sample in test_set]
    return np.array(decisions)


feature_length = 324  # dimensionality of each HOG feature vector
learning_rate = 0.0001  # step size applied to w and b on every update
object_num = 0  # label value Train maps to the negative class (-1); all other labels map to +1
study_total = 10000  # maximum number of learning iterations


if __name__ == '__main__':
    # Pipeline: load the CSV, extract HOG features, split, train, evaluate.
    print('Start reading data:')
    t_begin = time.time()

    # header=0 treats the first CSV line as the header, so data starts on
    # the following line. `.values` converts the DataFrame to an ndarray.
    table = pd.read_csv('./data/train_binary.csv', header=0).values

    # Column 0 is the label; every remaining column is a pixel value.
    labels = table[:, 0]
    img = table[:, 1:]

    print(img.shape)
    print(labels.shape)

    # Replace the raw pixels with HOG features.
    features = get_hog_features(img)
    print(features.shape)

    split = train_test_split(
        features, labels, test_size=0.33, random_state=11111
    )
    train_features, test_features, train_labels, test_labels = split
    print(train_features.shape)
    print(test_features.shape)

    t_loaded = time.time()
    print('read data cost %f seconds' % (t_loaded - t_begin))

    print('Starting training:')
    w, b = Train(train_features, train_labels)
    t_trained = time.time()
    print('training cost %f seconds' % (t_trained - t_loaded))

    print('Starting predicting:')
    test_predict = Predict(test_features, w, b)
    t_predicted = time.time()
    print('predicting cost %f seconds' % (t_predicted - t_trained))

    # Predictions are booleans: True matches label 1 and False matches
    # label 0 in the element-wise comparison below.
    flat_predict = test_predict.reshape(len(test_labels))
    n_correct = np.sum(test_labels == flat_predict)
    accuracy = n_correct / len(test_labels)
    print('The accuracy is: %f!' % accuracy)

'''
output:
Start reading data:
(42000, 784)
(42000,)
(42000, 324)
(28140, 324)
(13860, 324)
read data cost 6.194034 seconds
Starting training:
training cost 46.450333 seconds
Starting predicting:
predicting cost 0.081242 seconds
The accuracy is: 0.996609!
'''
统计学习方法_感知机实现

数据集为二值化的MNIST，下载地址：MNIST

猜你喜欢