# 基于高斯分布的异常检测代码实现 — anomaly detection based on a Gaussian distribution




import matplotlib.pyplot as plt
import numpy as np
import csv
from numpy import genfromtxt
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score

# Plot appearance settings.
plt.style.use('ggplot')

# Apply every font/size tweak in a single batch instead of key-by-key.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})



# Load the three data splits from CSV files on disk.
def _load_csv(path):
    """Read a comma-separated file of numbers into a 2-D float ndarray.

    Uses a context manager so the file handle is closed promptly — the
    original code opened three files and never closed any of them.
    """
    with open(path, "r") as fh:
        rows = list(csv.reader(fh, delimiter=","))
    return np.array(rows).astype("float")

# Training / cross-validation / test arrays.
train_data = _load_csv("train_server_data.csv")
crossval_data = _load_csv("crossval_server_data.csv")
test_data = _load_csv("test_server_data.csv")

# NOTE(review): dead code preserved from the original as a module-level
# string literal; `read_dataset` and `feature_normalize` are never used.
# Consider deleting this block entirely.
'''
def read_dataset(filePath, delimiter=','):
    return genfromtxt(filePath, delimiter=delimiter)

#特征归一化
def feature_normalize(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma
'''
# Gaussian parameter estimation.
def estimate_gaussian(dataset):
    """Fit a multivariate Gaussian to *dataset* (rows = samples).

    Returns ``(mu, sigma)``: the per-feature mean vector and the full
    covariance matrix (``np.cov`` of the transposed data), which captures
    how pairs of features vary together.
    """
    return np.mean(dataset, axis=0), np.cov(dataset.T)

# Multivariate Gaussian density.
def multivariate_gaussian(dataset, mu, sigma):
    """Evaluate the N(mu, sigma) probability density at each row of *dataset*."""
    return multivariate_normal(mean=mu, cov=sigma).pdf(dataset)

#选择阈值 — threshold selection helpers
def _binary_f1(y_true, y_pred):
    """F1 score for the positive (anomaly) class of two boolean arrays.

    Returns 0.0 when F1 is undefined (no positives predicted or present),
    silently — the original sklearn call emitted UndefinedMetricWarning
    for the same situation (see the captured run log at the bottom of the
    file) while also returning 0.
    """
    tp = np.sum(y_true & y_pred)
    fp = np.sum(~y_true & y_pred)
    fn = np.sum(y_true & ~y_pred)
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom else 0.0

def select_threshold(probs, test_data):
    """Sweep 1000 candidate epsilons and keep the one maximizing F1.

    A point is flagged anomalous when its density ``probs`` falls below
    epsilon.

    Parameters
    ----------
    probs : array-like of float
        Gaussian pdf values for each sample.
    test_data : array-like of 0/1
        Ground-truth labels (1 = anomaly), same length as ``probs``.

    Returns
    -------
    (best_f1, best_epsilon) : tuple of float
        Both 0 when no epsilon beats an F1 of 0.
    """
    probs = np.asarray(probs).ravel()
    labels = np.asarray(test_data).ravel().astype(bool)
    best_epsilon = 0
    best_f1 = 0
    lo, hi = probs.min(), probs.max()
    # Guard: a constant pdf would make the arange step 0 and crash.
    if hi == lo:
        return best_f1, best_epsilon
    for epsilon in np.arange(lo, hi, (hi - lo) / 1000):
        predictions = probs < epsilon
        f = _binary_f1(labels, predictions)
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon

    return best_f1, best_epsilon

# Fit the Gaussian to the training set and score every training point.
mu, sigma = estimate_gaussian(train_data)
p = multivariate_gaussian(train_data,mu,sigma)

# Select the optimal epsilon on the cross-validation set.
# NOTE(review): the probabilities come from crossval_data but the labels
# passed in are test_data — presumably test_data holds the ground-truth
# labels for the cross-validation points; verify against the CSV contents.
p_cv = multivariate_gaussian(crossval_data,mu,sigma)
fscore, ep = select_threshold(p_cv,test_data)
print(fscore, ep)

# Indices of training points whose density falls below the threshold.
outliers = np.asarray(np.where(p < ep))

# Figure 1: raw scatter of the two features in the training set.
plt.figure(1)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Datapoints of distribution')
plt.plot(train_data[:,0], train_data[:,1],'b+')
plt.show()

# Figure 2: same scatter with the detected outliers highlighted.
plt.figure(2)
plt.xlabel('motor1')
plt.ylabel('motor2')
plt.title('Detection of Outliers')
plt.plot(train_data[:,0],train_data[:,1],'bx')
plt.plot(train_data[outliers,0],train_data[outliers,1],'ro')  # mark anomalies in red
plt.show()

# NOTE(review): captured console output from a past run, preserved as a
# string literal — sklearn's UndefinedMetricWarning, a matplotlib font
# fallback warning, and the printed (f1, epsilon) result.
'''
C:\Users\z003tesj\AppData\Local\Programs\Python\Python35\lib\site-packages\sklearn\metrics\classification.py:1135: 
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
0.875 9.036201327981212e-05
C:\Users\z003tesj\AppData\Local\Programs\Python\Python35\lib\site-packages\matplotlib\font_manager.py:1320: 
UserWarning: findfont: Font family ['serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
[Finished in 14.4s]
'''

# 转载自 (source): blog.csdn.net/btujack/article/details/81181566