1 异常检测
对于异常检测的定义我觉得很难定义,这个是个业务概念。数据认知的异常与人类认知的异常的结合。通过无监督的机器学习方法去算法帮我们快速找到潜在的异常的数据,帮助我们缩小人工筛查范围。我就谈谈风控中的异常检测吧。
2 风控中异常检测
孤立点异常
比如移动端游戏场景,某些用户采用xposed或者是修改游戏的lua脚本,直接修改变量值。这种单个用户如果不采取措施,很快就会被突破口子。
聚集异常
多用户多批量采用相同的操作手法来刷接口,刷广告流量,刷注册,发垃圾广告,刷推广等等。一般情况下在大数据平台中通过group by 进行大量地聚集就能缩小范围。但是还是不够智能,只能防一般般的黑灰产,对于有组织 有纪律的黑灰产还是得依赖算法进行标注。
3 代码
对于异常检测的定义,网上文章汗牛充栋,作为经常copy paster的 我只能大概了解一下常用的使用场景,仅以此文记录一下一些经典的常用的基于 和 pyod 的异常检测demo代码 。
sklearn 多不用多介绍,本身了自带不少写代码的库
Pyod是赵越博士的
具体查看 https://zhuanlan.zhihu.com/p/58313521
3.1 sklearn
svm
EllipticEnvelope
IsolationForest
LocalOutlierFactor
pyod
Talk is cheap ,show me the code !
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
rng = np.random.RandomState(42)
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]
# define two outlier detection tools to be compared
classifiers = {
"One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
kernel="rbf", gamma=0.1),
"Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
"Isolation Forest": IsolationForest(max_samples=n_samples,
contamination=outliers_fraction,
random_state=rng),
"Local Outlier Factor": LocalOutlierFactor(
n_neighbors=35,
contamination=outliers_fraction)}
# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1
# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
np.random.seed(42)
# Data generation
X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
X = np.r_[X1, X2]
# Add outliers
X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
# Fit the model
plt.figure(figsize=(9, 7))
for i, (clf_name, clf) in enumerate(classifiers.items()):
# fit the data and tag outliers
if clf_name == "Local Outlier Factor":
y_pred = clf.fit_predict(X)
scores_pred = clf.negative_outlier_factor_
else:
clf.fit(X)
scores_pred = clf.decision_function(X)
y_pred = clf.predict(X)
# 选取预定的前25%的分数的分界线作为阈值
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
# 计算误差
n_errors = (y_pred != ground_truth).sum()
# 绘制等高线
if clf_name == "Local Outlier Factor":
# decision_function is private for LOF
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
subplot = plt.subplot(2, 2, i + 1)
subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
cmap=plt.cm.Blues_r)
# 用红线画阈值边界
a = subplot.contour(xx, yy, Z, levels=[threshold],
linewidths=2, colors='red')
# 用橙色填充阈值区域内的背景
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
colors='orange')
b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white',
s=20, edgecolor='k')
c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black',
s=20, edgecolor='k')
subplot.axis('tight')
subplot.legend(
[a.collections[0], b, c],
['learned decision function', 'true inliers', 'true outliers'],
prop=matplotlib.font_manager.FontProperties(size=10),
loc='lower right')
subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
subplot.set_xlim((-7, 7))
subplot.set_ylim((-7, 7))
plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
plt.suptitle("Outlier detection")
plt.show()
3.2 PyOD
from pyod.models.knn import KNN # imprt kNN分类器
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
contamination = 0.1 # percentage of outliers
n_train = 200 # number of training points
n_test = 100 # number of testing points
X_train, y_train, X_test, y_test = generate_data(
n_train=n_train, n_test=n_test, contamination=contamination)
# 训练一个kNN检测器
clf_name = 'kNN'
clf = KNN() # 初始化检测器clf
clf.fit(X_train) # 使用X_train训练检测器clf
# 返回训练数据X_train上的异常标签和异常分值
y_train_pred = clf.labels_ # 返回训练数据上的分类标签 (0: 正常值, 1: 异常值)
y_train_scores = clf.decision_scores_ # 返回训练数据上的异常值 (分值越大越异常)
# 用训练好的clf来预测未知数据中的异常值
y_test_pred = clf.predict(X_test) # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值)
y_test_scores = clf.decision_function(X_test) # 返回未知数据上的异常值 (分值越大越异常)
# 评估预测结果
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# 可视化
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
y_test_pred, show_figure=True, save_figure=False)
关于我
Tarzan 互联网码农,主要做大数据分析,数据仓库,风控