模型训练

评估器

评估器是需要评估的统计模型，对所观测对象做预测或分类。如果从抽象的评估器类派生，新模型必须实现.fit()方法，该方法用给出的在DataFrame中找到的数据和某些默认或自定义的参数来拟合模型。在PySpark 中，有很多评估器可用，本文以Spark2.2.1中提供的模型为例进行介绍。
分类
分类
ML包为数据科学家提供了七种分类（Classification）模型以供选择。

线性回归

class pyspark.ml.regression.LinearRegression(featuresCol=’features’, labelCol=’label’, predictionCol=’prediction’, maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, solver=’auto’, weightCol=None, aggregationDepth=2)

>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([
...     (1.0, 2.0, Vectors.dense(1.0)),
...     (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
>>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
>>> model = lr.fit(df)
>>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001
True
>>> abs(model.coefficients[0] - 1.0) < 0.001
True
>>> abs(model.intercept - 0.0) < 0.001
True
>>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
>>> abs(model.transform(test1).head().prediction - 1.0) < 0.001
True
>>> lr.setParams("vector")
Traceback (most recent call last):
    ...
TypeError: Method setParams forces keyword arguments.
>>> lr_path = temp_path + "/lr"
>>> lr.save(lr_path)
>>> lr2 = LinearRegression.load(lr_path)
>>> lr2.getMaxIter()
5
>>> model_path = temp_path + "/lr_model"
>>> model.save(model_path)
>>> model2 = LinearRegressionModel.load(model_path)
>>> model.coefficients[0] == model2.coefficients[0]
True
>>> model.intercept == model2.intercept
True
>>> model.numFeatures
1

逻辑回归

（1）LogisticRegression :逻辑回归,支持多项逻辑（softmax）和二项逻辑回归。

pyspark.ml.classification.LogisticRegression(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, aggregationDepth=2, family="auto")
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, aggregationDepth=2, family="auto")

#-*- coding:utf-8 -*-

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel

if __name__ == "__main__":
    # Start the SparkContext and obtain the SparkSession used by this example.
    sc = SparkContext(appName="myApp")
    spark = SparkSession.builder.getOrCreate()

    # Four weighted training samples, each with a two-element feature vector.
    training_rows = [
        Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
        Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
        Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
        Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0)),
    ]
    df = sc.parallelize(training_rows).toDF()
    """
    df.show()
    +---------+-----+------+
    | features|label|weight|
    +---------+-----+------+
    |[0.0,5.0]|  1.0|   1.0|
    |[1.0,2.0]|  0.0|   2.0|
    |[2.0,1.0]|  1.0|   3.0|
    |[3.0,3.0]|  0.0|   4.0|
    +---------+-----+------+
    """

    # Fit a logistic regression that reads per-sample weights from "weight".
    blor = LogisticRegression(regParam=0.01, weightCol="weight")
    blorModel = blor.fit(df)
    # print(blorModel.coefficients)
    # [-1.08072664359,-0.646290405354]
    # print(blorModel.intercept)
    # 3.1127663191585144

    # Score a single unseen feature vector with the fitted model.
    test_rows = [Row(features=Vectors.dense(-1.0, 1.0))]
    test0 = sc.parallelize(test_rows).toDF()
    result = blorModel.transform(test0)
    """
    result.show()
    +----------+--------------------+--------------------+----------+
    |  features|       rawPrediction|         probability|prediction|
    +----------+--------------------+--------------------+----------+
    |[-1.0,1.0]|[-3.5472025573965...|[0.02799860485691...|       1.0|
    +----------+--------------------+--------------------+----------+
    """

    temp_path = "/tmp/test"

    # Round-trip the unfitted estimator (its parameters) through storage.
    lr_path = temp_path + "/lr"
    blor.save(lr_path)
    blorLoad = LogisticRegression.load(lr_path)
    # print(blorLoad.getRegParam())
    # 0.01

    # Round-trip the fitted model through storage.
    model_path = temp_path + "/lr_model"
    blorModel.save(model_path)
    blorModelLoad = LogisticRegressionModel.load(model_path)
    # print(blorModelLoad.intercept==blorModel.intercept)
    # True

支持向量机

class pyspark.ml.classification.LinearSVC(*args, **kwargs)
这个二元分类器使用OWLQN优化器来优化the Hinge Loss,目前只支持L2正则化。

#-*- coding:utf-8 -*-
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LinearSVCModel

# Start Spark and build a two-row training set (one sample per class).
sc = SparkContext(appName="myApp")
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = sc.parallelize([
    Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0)),
]).toDF()

# Fit the linear SVM and inspect the fitted model.
svm = LinearSVC(maxIter=5, regParam=0.01)
model = svm.fit(df)
print(model.coefficients)
# [0.0,-0.27917116657,-0.183278426036]
print(model.intercept)
# 1.0206118982229047
print(model.numClasses)
# 2
print(model.numFeatures)
# 3

# Predict on one unseen feature vector.
test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 2.0, 3.0))]).toDF()
result = model.transform(test0).head()
print(result.prediction)
# 0.0
print(result.rawPrediction)
# [0.0875657130274,-0.0875657130274]

# Save the estimator's training parameters.
# An empty prefix would resolve to "/svm" at the filesystem root, so use a
# scratch directory instead.
temp_path = "/tmp/test"
svm_path = temp_path + "/svm"
svm.save(svm_path)
# Reload the estimator's training parameters.
svm2 = LinearSVC.load(svm_path)
print(svm2.getMaxIter())
# 5
# Save the fitted model.
model_path = temp_path + "/svm_model"
model.save(model_path)
# Reload the fitted model and check that it matches the original.
model2 = LinearSVCModel.load(model_path)
print(model.coefficients[0] == model2.coefficients[0])
# True
print(model.intercept == model2.intercept)
# True

决策树

（1）DecisionTreeClassifier：支持二元和多类标签，以及连续特征和类别特征

pyspark.ml.classification.DecisionTreeClassifier(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", seed=None)
maxDepth参数来限制树的深度；
minInstancesPerNode确定需要进一步拆分的树节点的观察对象的最小数量；maxBins参数指定连续变量将被分割的Bin的最大数量；
impurity指定用于测量并计算来自分割的信息的度量。

（2）DecisionTreeRegressor：支持连续特征和类别特征。

pyspark.ml.regression.DecisionTreeRegressor(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", seed=None, varianceCol=None)

>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([
...     (1.0, Vectors.dense(1.0)),
...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
>>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
>>> model = dt.fit(df)
>>> model.depth
1
>>> model.numNodes
3
>>> model.featureImportances
SparseVector(1, {0: 1.0})
>>> model.numFeatures
1
>>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
0.0
>>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
>>> model.transform(test1).head().prediction
1.0
>>> dtr_path = temp_path + "/dtr"
>>> dt.save(dtr_path)
>>> dt2 = DecisionTreeRegressor.load(dtr_path)
>>> dt2.getMaxDepth()
2
>>> model_path = temp_path + "/dtr_model"
>>> model.save(model_path)
>>> model2 = DecisionTreeRegressionModel.load(model_path)
>>> model.numNodes == model2.numNodes
True
>>> model.depth == model2.depth
True
>>> model.transform(test1).head().variance
0.0

梯度提升决策树模型

（1）GBTClassifier：支持二元标签，以及连续特征和类别特征，不支持多类标签

用于分类的梯度提升决策树模型。该模型属于集成模型（Ensemble methods）家族。集成模型结合多个弱预测模型而形成一个强健的模型。

pyspark.ml.classification.GBTClassifier(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)

参数参考：以下是mllib的接口而不是ml的接口，这里只是作一个参考
trainClassifier参数说明

data - 训练数据集：LabeledPoint的RDD。标签取值{0，1}。
categoricalFeaturesInfo - 存储类别特征的Map。条目（n - > k）表示特征n对应k个类别，类别由{0,1，...，k-1}索引。
loss - 梯度提升中用到的损失函数。支持的值：“logLoss”，“leastSquaresError”，“leastAbsoluteError”。（默认值：“logLoss”）
numIterations - 迭代次数。（默认值：100）
learningRate - 学习率。学习率应在间隔（0，1）之间（默认值：0.1）
maxDepth - 树的最大深度（例如深度0表示1个叶节点，深度1表示1个内部节点+ 2个叶节点）。（默认值：3）
maxBins - 用于分割特征的最大bin数量。 DecisionTree需要maxBins> = max类别。（默认值：32）
返回值：GradientBoostedTreesModel 可用于预测。

#官网例子
#-*- coding:utf-8 -*-

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from numpy import allclose

# data=[LabeledPoint(0.0,[0.0]),LabeledPoint(0.0,[1.0]),LabeledPoint(1.0,[2.0]),LabeledPoint(1.0,[3.0])]
# Start Spark and build a two-row training set.
sc = SparkContext(appName="myApp")
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
    ["label", "features"],
)
# df.show()

# Index the label column; the classifier is trained on the "indexed" column.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
siModel = stringIndexer.fit(df)
td = siModel.transform(df)

gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
model = gbt.fit(td)

# Results are printed explicitly: as bare expressions they would be
# discarded silently when this file is run as a script.
print(model.featureImportances)
# (1,[0],[1.0])
print(allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]))
# True

# Predict on unseen feature vectors (one dense, one sparse).
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
print(model.transform(test0).head().prediction)
# 0.0
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
print(model.transform(test1).head().prediction)
# 1.0
print(model.totalNumNodes)
# 15
print(model.toDebugString)

# Save the unfitted estimator and the fitted model.
temp_path = "/tmp/test/"
gbtc_path = temp_path + "gbtc"
gbt.save(gbtc_path)
model_path = temp_path + "gbtc_model"
model.save(model_path)

# Reload the estimator and confirm that its parameters survived.
gbt2 = GBTClassifier.load(gbtc_path)
print(gbt2.getMaxDepth())
# 2

# Reload the fitted model and compare it with the original.
model2 = GBTClassificationModel.load(model_path)
print(model.featureImportances == model2.featureImportances)
# True
print(model.treeWeights == model2.treeWeights)
# True
print(model.trees)
# [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]

（2）GBTRegressor：梯度提升树（GBT）的回归学习算法，它支持连续特征和类别特征。

pyspark.ml.regression.GBTRegressor(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, impurity="variance")

trainRegressor参数说明

data - 训练数据集：LabeledPoint的RDD。标签为实数。
categoricalFeaturesInfo - 存储类别特征的Map。条目（n - > k）表示特征n对应k个类别，类别由{0,1，…，k-1}索引。
loss - 损失函数。支持的值：“logLoss”，“leastSquaresError”，“leastAbsoluteError”。（默认值：“leastSquaresError”）
numIterations - 提升次数。（默认值：100）
learningRate - 学习率。学习率应在间隔（0，1）之间（默认值：0.1）
maxDepth - 树的最大深度（例如深度0表示1个叶节点，深度1表示1个内部节点+ 2个叶节点）。（默认值：3）
maxBins - 用于分裂特征的最大bin数量。 DecisionTree需要maxBins> = max类别。（默认值：32）
返回值：GradientBoostedTreesModel可用于预测。

随机森林

（1）RandomForestClassifier：随机森林学习算法的分类。它支持二元和多类标签，以及连续特征和类别特征。
该模型产生多个决策树，使用这些决策树输出的众数来对观察对象进行分类。

pyspark.ml.classification.RandomForestClassifier(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0)

#-*- coding:utf-8 -*-

from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel

# Start Spark and build a two-row training set.
sc = SparkContext(appName="myApp")
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
    ["label", "features"],
)

# Index the label column; the classifier is trained on the "indexed" column.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
siModel = stringIndexer.fit(df)
td = siModel.transform(df)

rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
model = rf.fit(td)
print(model.featureImportances)
# (1,[0],[1.0])
print(model.treeWeights)
# [1.0, 1.0, 1.0]
# Compare the per-tree weights (one entry per tree) with the expected values;
# the feature-importance vector has a different length and meaning.
print(allclose(model.treeWeights, [1.0, 1.0, 1.0]))
# True

# Use the fitted model to predict on new data.
test0 = spark.createDataFrame([(Vectors.dense(-1),)], ["features"])
result = model.transform(test0).head()
print(result.prediction)
# 0.0
print(numpy.argmax(result.probability))
# 0
print(numpy.argmax(result.rawPrediction))
# 0
print(model.trees)
#[DecisionTreeClassificationModel (uid=dtc_d54f917f8495) of depth 0 with 1 nodes, DecisionTreeClassificationModel (uid=dtc_e5ab92161f67) of depth 1 with 3 nodes, DecisionTreeClassificationModel (uid=dtc_437a7e97c21f) of depth 1 with 3 nodes]

# Save the estimator's parameters.
# An empty prefix would resolve to "/rfc" at the filesystem root, so use a
# scratch directory instead.
temp_path = "/tmp/test"
rfc_path = temp_path + "/rfc"
rf.save(rfc_path)
# Reload the unfitted estimator.
rf2 = RandomForestClassifier.load(rfc_path)
print(rf2.getNumTrees())
# 3
# Save the fitted model.
model_path = temp_path + "/rfc_model"
model.save(model_path)
# Reload the fitted model and compare it with the original.
model2 = RandomForestClassificationModel.load(model_path)
print(model.featureImportances == model2.featureImportances)
# True

(2)class pyspark.ml.regression.RandomForestRegressor

#-*-coding:utf-8-*-

from pyspark import SparkContext
from pyspark.sql import SparkSession
from numpy import allclose
import numpy
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel

# Start Spark and build a two-row training set.
sc = SparkContext(appName="myApp")
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
    ["label", "features"],
)

rfr = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
model = rfr.fit(df)
print(model.featureImportances)
# (1,[0],[1.0])
print(model.treeWeights)
# [1.0, 1.0]

# Use the fitted model to predict on new data.
test0 = spark.createDataFrame([(Vectors.dense(-1),)], ["features"])
result = model.transform(test0).head()
print(result.prediction)
# 0.0
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
print(model.transform(test1).head().prediction)
# 0.5
print(model.numFeatures)
# 1
print(model.trees)
#[DecisionTreeRegressionModel (uid=dtr_c2df1ce337bb) of depth 0 with 1 nodes, DecisionTreeRegressionModel (uid=dtr_2c8be86755ae) of depth 1 with 3 nodes]
# NOTE(review): accessed without a call here, which matches the recorded
# output of 2 on the fitted model — confirm against the PySpark version in use.
print(model.getNumTrees)
# 2

# Save the estimator's parameters.
# An empty prefix would resolve to "/rfr" at the filesystem root, so use a
# scratch directory instead.
temp_path = "/tmp/test"
rfr_path = temp_path + "/rfr"
rfr.save(rfr_path)
# Reload the estimator's parameters.
rfr2 = RandomForestRegressor.load(rfr_path)
# On the estimator getNumTrees is a regular getter and must be called;
# without the parentheses this would print the bound method object.
print(rfr2.getNumTrees())
# 2

# Save the fitted model.
model_path = temp_path + "/rfr_model"
model.save(model_path)
# Reload the fitted model and compare it with the original.
model2 = RandomForestRegressionModel.load(model_path)
print(model2.featureImportances == model.featureImportances)
# True

OneVsRest

（5）OneVsRest：多分类问题简化为二分类问题。

pyspark.ml.classification.OneVsRest(self, featuresCol="features", labelCol="label", predictionCol="prediction", classifier=None)

pyspark.ml.classification.NaiveBayes

pyspark.ml.classification.MultilayerPerceptronClassifier

模型调优

朴素贝叶斯

模型评估方法
将其作为多分类结果进行评估，可计算f1、精度、召回率、准确度（见MulticlassClassificationEvaluator源代码）
将其作为二分类结果进行评估，可计算areaUnderROC、areaUnderPR（见BinaryClassificationEvaluator源代码）
或自己计算
模型调优
（1）new NaiveBayes().setThresholds(Array(100.0,1.0))
为每个分类设置一个阈值，参数的长度必须和类的个数相等。最终的分类结果会是p/t最大的那个分类，其中p是通过Bayes计算出来的结果，t是阈值。
这对于训练样本严重不均衡的情况尤其重要，比如分类1只有20W数据，而分类0有2000万数据，此时应用new NaiveBayes().setThresholds(Array(100.0,1.0))

逻辑回归

模型评估方法
将其作为多分类结果进行评估，可计算f1、精度、召回率、准确度（见MulticlassClassificationEvaluator源代码）
将其作为二分类结果进行评估，可计算areaUnderROC、areaUnderPR（见BinaryClassificationEvaluator源代码）
或自己定义
模型调优
这些参数用于设置列名：
setPredictionCol setLabelCol setProbabilityCol setFeaturesCol setWeightCol setRawPredictionCol
以下这些参数用于设置各种学习参数：

（1）setThreshold/setThresholds

setThreshold设置了阈值，大于这个阈值则分类为1，小于则分类为0。默认值为0.5
setThresholds用于多分类的情况。
以下示例如何找到最优的threshold。

（2）setRegParam：正则化参数

默认值为0.
正则化参数主要是为了解决过度拟合的问题，详细理论请参考《逻辑回归原理与实现》

lrModel.getRegParam  // 正则化参数>=0

当设置正则化参数为0～3时，计算检验样本，分别得到召回率与精度如下：

0.0  0.9205683447008687  0.9527194528239897
0.05 0.9239064559263499  0.958818361519877
0.1  0.9223865090282922  0.9569734714653572
0.15 0.9201643081836635  0.9546774724781172
0.2  0.9183653836903926  0.952554380363201
0.3  0.9148060143721561  0.9484744816030162
0.5  0.9084953487700936  0.9416216324007418
3.0  0.8789044838433493  0.9037430510218213

这组数据中可以看出当正则化参数为0.05时，分类效果最优。但对于其它数据可能就会有过拟合的问题了，所以要视样本情况而调整正则化参数。

正则化参数过小，则可能过拟合。过大则可能欠拟合。

（3） setMaxIter

最大的迭代次数，当达到这个次数时，不管是否已经收敛到最小误差，均会结束训练。默认值为100。

（4） setTol

算法的收敛阈值，当小于这个值时，结束迭代计算，默认值为1.0E-6。

（5）setStandardization

是否对特征值进行标准化，默认为true。

（6）setElasticNetParam

默认值为0.0，这是一个 $L_2$ 惩罚。用于防止过拟合的另一种方式，理论详见《L0/L1/L2》
对于 $\alpha = 0$ ，惩罚是 $L_2$ 惩罚。对于 $\alpha = 1$ ，它是一个 $L_1$ 惩罚。对于 $0 < \alpha < 1$ ，惩罚是 $L_1$ 和 $L_2$ 的组合。

（7）setFitIntercept

Param for whether to fit an intercept term.

举个简单情况 $y = w_1 x + w_0$ ，这里 $w_0$ 就是一个截距，使直线可以不穿过原点。从这个角度想想，$w_0$ 确实不应该正则化，值是多少就多少。
如果设置为false，则 $intercept=0.0$ ，否则为实际值。

(8) setFamily

这是2.1才引入的参数，可设置二项式还是多项式模型。

评估:pyspark.ml.evaluation module

class pyspark.ml.evaluation.Evaluator
class pyspark.ml.evaluation.BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC')
class pyspark.ml.evaluation.RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='rmse')
class pyspark.ml.evaluation.MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')

调优:pyspark.ml.tuning module

ml.tuning模块对参数调优的想法很简单：先生成不同参数的网格，即可能的参数组合；设置评估优劣的标准，比如二分类可用BinaryClassificationEvaluator；然后利用交叉验证来计算不同参数组合下的结果，并取到最好的情况；求最优参数可以将参数当成目标函数的一部分，然后利用梯度下降法之类的进行更新；可以先优化某一个参数，然后再优化另一个，依此类推，甚至可以反复几次该过程；但ml中很有可能就是逐个参数组合计算的，当然可以想像，这种方式会很耗时；这个过程可以自己找些调优的小技巧。

class pyspark.ml.tuning.ParamGridBuilder
baseOn(*args)
将给定的参数在网格中固定为指定的值（如下例中的标签列名和预测列名）。可以用字典或列表的方式给出。
addGrid(param, values)
设置调整参数及其取值范围
build()
产生调优参数的所有取值的组合

#-*-coding:utf-8-*-
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

if __name__ == "__main__":
    sc = SparkContext(appName="myApp")
    lr = LogisticRegression()

    # Pin the label/prediction column names and sweep regParam x maxIter.
    builder = ParamGridBuilder()
    builder = builder.baseOn({lr.labelCol: 'l'})
    builder = builder.baseOn([lr.predictionCol, 'p'])
    builder = builder.addGrid(param=lr.regParam, values=[1, 2])
    builder = builder.addGrid(param=lr.maxIter, values=[1, 5])
    paramGrid = builder.build()

    """
    print(paramGrid)
    [
        {
            Param(parent='LogisticRegression_48b998ffadd246fc8558', name='labelCol', doc='label column name.'): 'l', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='regParam', doc='regularization parameter (>= 0).'): 1, Param(parent='LogisticRegression_48b998ffadd246fc8558', name='predictionCol', doc='prediction column name.'): 'p', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='maxIter', doc='max number of iterations (>= 0).'): 1
        }, 
        {
            Param(parent='LogisticRegression_48b998ffadd246fc8558', name='labelCol', doc='label column name.'): 'l', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='regParam', doc='regularization parameter (>= 0).'): 1, Param(parent='LogisticRegression_48b998ffadd246fc8558', name='predictionCol', doc='prediction column name.'): 'p', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='maxIter', doc='max number of iterations (>= 0).'): 5
        }, 
        {
            Param(parent='LogisticRegression_48b998ffadd246fc8558', name='labelCol', doc='label column name.'): 'l', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='regParam', doc='regularization parameter (>= 0).'): 2, Param(parent='LogisticRegression_48b998ffadd246fc8558', name='predictionCol', doc='prediction column name.'): 'p', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='maxIter', doc='max number of iterations (>= 0).'): 1
        }, 
        {
            Param(parent='LogisticRegression_48b998ffadd246fc8558', name='labelCol', doc='label column name.'): 'l', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='regParam', doc='regularization parameter (>= 0).'): 2, Param(parent='LogisticRegression_48b998ffadd246fc8558', name='predictionCol', doc='prediction column name.'): 'p', Param(parent='LogisticRegression_48b998ffadd246fc8558', name='maxIter', doc='max number of iterations (>= 0).'): 5
        }
    ]
    """
    # The four parameter maps the grid is expected to contain, in the order
    # (regParam, maxIter) = (1.0, 1), (2.0, 1), (1.0, 5), (2.0, 5).
    expected = [
        {lr.regParam: reg, lr.maxIter: n_iter, lr.labelCol: 'l', lr.predictionCol: 'p'}
        for n_iter in (1, 5)
        for reg in (1.0, 2.0)
    ]
    # print(len(paramGrid) == len(expected))
    # True
    # print(all([m in expected for m in paramGrid]))
    # True

class pyspark.ml.tuning.CrossValidator(estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=None, parallelism=1)

注：parallelism 参数自 Spark 2.3 起提供，下面的示例用到了该参数。

>>> from pyspark.ml.classification import LogisticRegression
>>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
>>> from pyspark.ml.linalg import Vectors
>>> dataset = spark.createDataFrame(
...     [(Vectors.dense([0.0]), 0.0),
...      (Vectors.dense([0.4]), 1.0),
...      (Vectors.dense([0.5]), 0.0),
...      (Vectors.dense([0.6]), 1.0),
...      (Vectors.dense([1.0]), 1.0)] * 10,
...     ["features", "label"])
>>> lr = LogisticRegression()
>>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
>>> evaluator = BinaryClassificationEvaluator()
>>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
...     parallelism=2)
>>> cvModel = cv.fit(dataset)
>>> cvModel.avgMetrics[0]
0.5
>>> evaluator.evaluate(cvModel.transform(dataset))
0.8333...

class pyspark.ml.tuning.TrainValidationSplit(estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, parallelism=1, seed=None)

与ml.tuning.CrossValidator类似，只不过只会随机分裂出一部分测试样本一次，而不是像交叉验证那样有n次轮换。

>>> from pyspark.ml.classification import LogisticRegression
>>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
>>> from pyspark.ml.linalg import Vectors
>>> dataset = spark.createDataFrame(
...     [(Vectors.dense([0.0]), 0.0),
...      (Vectors.dense([0.4]), 1.0),
...      (Vectors.dense([0.5]), 0.0),
...      (Vectors.dense([0.6]), 1.0),
...      (Vectors.dense([1.0]), 1.0)] * 10,
...     ["features", "label"])
>>> lr = LogisticRegression()
>>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
>>> evaluator = BinaryClassificationEvaluator()
>>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
...     parallelism=2)
>>> tvsModel = tvs.fit(dataset)
>>> evaluator.evaluate(tvsModel.transform(dataset))
0.8333...

使用 ML Pipeline 构建机器学习工作流

Step 1
Read the source data file and convert it to be a dataframe with columns named.
Step 2
StringIndexer encodes a string column of labels to a column of label indices.
Step 3
Define a VectorAssembler transformer to transform source features data to be a vector
Step 4
Create model instance and set the input parameters.
Step 5
Convert indexed class labels back to original one so that it can be easily understood when we need to display or save the prediction result to a file.
注意：这一步要用到Step 2的转换器
Step 6
Randomly split the input data by 8:2, while 80% is for training, the rest is for testing.
Step 7
Create an ML pipeline constructed from 4 PipelineStage objects, and then call the fit method to perform the defined operations on the training data.
Step 8
Perform predictions about testing data. This transform method will return a result DataFrame with new prediction column appended towards previous DataFrame.
Step 9
Select features,label,and predicted label from the DataFrame to display.
Step 10
The evaluator code is used to compute the prediction accuracy; this is usually a valuable way to estimate the prediction accuracy of the trained model.

来源：王龙 2015 年 11 月 02 日发布

引言

使用机器学习 (Machine Learning) 技术和方法来解决实际问题，已经被成功应用到多个领域，我们经常能够看到的实例有个性推荐系统，金融反欺诈，自然语言处理和机器翻译，模式识别，智能控制等。一个典型的机器学习过程通常会包含：源数据 ETL，数据预处理，指标提取，模型训练与交叉验证，新数据预测等。我们可以看到这是一个包含多个步骤的流水线式工作，也就是说数据从收集开始，要经历多个步骤，才能得到我们需要的输出。在本系列第 4 部分已经向大家介绍了 Spark MLlib 机器学习库, 虽然 MLlib 已经足够简单易用，但是如果目标数据集结构复杂需要多次处理，或者是对新数据进行预测的时候需要结合多个已经训练好的单个模型进行综合预测 (集成学习的思想)，那么使用 MLlib 将会让程序结构复杂，难于理解和实现。值得庆幸的是，在 Spark 的生态系统里，一个可以用于构建复杂机器学习工作流应用的新库已经出现了，它就是 Spark 1.2 版本之后引入的 ML Pipeline，经过几个版本的发展，截止目前的 1.5.1 版本已经变得足够稳定易用了。本文将向读者详细地介绍 Spark ML Pipeline 的设计思想和基本概念，以及如何使用 ML Pipeline 提供的 API 库编写一个解决分类预测问题的 Pipeline 式应用程序。相信通过本文的学习，读者可以较为深入的理解 ML Pipeline，进而将它推广和应用到更多复杂问题的解决方案上去。

关于 ML Pipeline

Spark ML Pipeline 的出现，是受到了 scikit-learn 项目的启发，并且总结了 MLlib 在处理复杂机器学习问题上的弊端，旨在向用户提供基于 DataFrame 之上的更加高层次的 API 库，以更加方便的构建复杂的机器学习工作流式应用。一个 Pipeline 在结构上会包含一个或多个 PipelineStage，每一个 PipelineStage 都会完成一个任务，如数据集处理转化，模型训练，参数设置或数据预测等，这样的 PipelineStage 在 ML 里按照处理问题类型的不同都有相应的定义和实现。接下来，我们先来了解几个重要概念。

DataFrame

关于 DataFrame 其实我们已经在本系列第 3 部分介绍过了，它较之 RDD，包含了 schema 信息，更类似传统数据库中的二维表格。它被 ML Pipeline 用来存储源数据。

DataFrame 可以被用来保存各种类型的数据，如我们可以把特征向量存储在 DataFrame 的一列中，这样用起来是非常方便的。

Transformer

Transformer 中文可以被翻译成转换器，是一个 PipelineStage，实现上也是继承自 PipelineStage 类，主要是用来把一个 DataFrame 转换成另一个 DataFrame，比如一个模型就是一个 Transformer，因为它可以把一个不包含预测标签的测试数据集 DataFrame 打上标签转化成另一个包含预测标签的 DataFrame，显然这样的结果集可以被用来做分析结果的可视化。

Estimator

Estimator 中文可以被翻译成评估器或适配器，在 Pipeline 里通常是被用来操作 DataFrame 数据并生产一个 Transformer，如一个随机森林算法就是一个 Estimator，因为它可以通过训练特征数据而得到一个随机森林模型。实现上 Estimator 也是继承自 PipelineStage 类。

Parameter

Parameter 被用来设置 Transformer 或者 Estimator 的参数。

要构建一个 Pipeline，首先我们需要定义 Pipeline 中的各个 PipelineStage，如指标提取和转换模型训练等。有了这些处理特定问题的 Transformer 和 Estimator，我们就可以按照具体的处理逻辑来有序的组织 PipelineStages 并创建一个 Pipeline，如 val pipeline = new Pipeline().setStages(Array(stage1,stage2,stage3,…))。然后就可以把训练数据集作为入参并调用 Pipeline 实例的 fit 方法来开始以流的方式来处理源训练数据，这个调用会返回一个 PipelineModel 类实例，进而被用来预测测试数据的标签，它是一个 Transformer。

随机森林及 ML 的实现

随机森林构建于决策树之上，顾名思义，就是随机的构建一个包含多个决策树的森林。随机森林里的决策树之间是独立的，在随机森林模型构建好以后，对于新来的测试样本数据，随机森林模型会让其中的每个决策树分别做一次预测，然后统计出现次数最多的预测标签，并将它作为最终的预测标签。随机森林算法运用的就是集成学习的思想，在实践中，随机森林往往都有很好的表现，并且多次预测结果稳定并且精度非常高，也不容易出现过拟合的问题。也是笔者最喜欢并且最常用的一种机器学习算法。

本文并不会重点介绍随机森林的基本理论，因为网上这样的文章已经很多了，本文将把重点放在对 Spark ML 中随机森林的实现以及可调参数的介绍。关于随机森林算法的详细介绍大家可以参考维基百科上的随机森林介绍。

Spark ML 中随机森林实现是在 RandomForestClassifier 类中，位于 org.apache.spark.ml.classification 包中，该实现中支持设置的主要参数如下：

官网文档中RandomForestClassifier 的setParams函数如下，我们可以从中查看RandomForestClassifier 可以设置哪些模型的参数，及各参数的默认值
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0)

关于参数的详细内容见官网

具体解释如下：
* featuresCol

训练数据集 DataFrame 中存储特征数据的列名。默认值为”features”，所以在数据准备的时候我们需要将相应的特征列命名为”features”，或者替换掉featuresCol的默认值

labelCol

标签列的名称。默认值为”label”

impurity

树节点选择的不纯度的衡量指标，取值可以是”entropy”或”gini”，默认是”gini”。

maxBins

离散连续性变量时最大的分箱数，默认是 32。理论上箱数越大粒度就越细，但是针对特定的数据集总有一个合理的箱数。

maxDepth

树的最大深度，默认值是 5。

numTrees

随机森林需要训练的树的个数，默认值是 20。

predictionCol

算法预测结果的存储列的名称, 默认是”prediction”。

rawPredictionCol

原始的算法预测结果的存储列的名称, 默认是”rawPrediction”

probabilityCol

类别预测结果的条件概率值存储列的名称, 默认值是”probability”

在后文中大家可以看到如何在程序中设置这些参数。可以调用 RandomForestClassifier.setXXX 方法或者在 ParamMap 里设定参数，然后再调用 RandomForestClassifier.fit 方法时传入 ParamMap 实例，如：
这里写图片描述

RandomForestClassifier 的 fit 方法从源头上来讲，是来自 Predictor 类 (Estimator 的子类)，Predictor 类的 fit 方法设计和实现上实际上是采用了模板方法的设计模式，具体会调用实现类的 train 方法

图 1. Predictor 类的 fit 方法实现预览
这里写图片描述

所以对于 RandomForestClassifier 类我们最需要关注的就是 train 方法，其中包含具体从源数据 DataFrame 训练一个随机森林模型的过程。train 方法在提取出 DataFrame 数据集中的 label 和 features 数据之后，进一步调用 RandomForest.run 方法去真正的开始训练随机森林模型，训练结束后会返回一个 RandomForestClassificationModel 类实例，这是一个 Transformer，会被用来预测测试数据集。

图 2. RandomForestClassifier 类的 train 方法实现预览
这里写图片描述
对于 RandomForest 类的 run 方法的具体实现逻辑，已经在 developerWorks 的“Spark 随机森林算法原理、源码分析及案例实战” 一文中有详细介绍，为了避免内容冲突，本文的内容将重点放在 ML Pipeline 的实现层次关系上，在这里不做赘述。

目标数据集预览

本文所使用的测试数据集来自 UCI 的 banknote authentication data set ，这是一个从纸币鉴别过程中的图片里提取的数据集，总共包含五个列，前 4 列是指标值 (连续型)，最后一列是真假标识。

图 3. 测试数据集格式
这里写图片描述
四列依次是小波变换图像的方差，小波变换图像的偏态，小波变换图像的峰度，图像熵，类别标签。其实读者并不需要知道什么是小波变换及其相关改变，只需要知道这是四个特征指标的值，我们将根据这些指标训练模型使用模型预测类别。对于该数据集的更多信息，读者可以参考 UCI 官网的描述。
为了便于在分布式环境下运行，我先将.txt文件的数据保存到hdfs上。

案例分析与编码实现

前面提到，本文的目的是使用 Spark ML Pipeline 构建一个对目标数据集进行分类预测的机器学习工作流，案例背景已经相当清晰，在了解了数据集本身和 ML Pipeline 的相关知识后，接下来就是编程实现了。关于实现基本思路和关键的 11 个步骤笔者已经在代码中做了详细解释，为了方便读者理解，这里特别的把该实例的 Pipeline 里包含的 4 个 Stage 重点介绍下。

这四个 Stage 分别对应代码注释里的步骤 2-5，作用如下：

第一个，使用 StringIndexer 去把源数据里的字符 Label，按照 Label 出现的频次对其进行序列编码, 如，0,1,2，…。在本例的数据中，可能这个步骤的作用不甚明显，因为我们的数据格式良好，Label 本身也只有两种，并且已经是类序列编码的”0”和”1”格式。但是对于多分类问题或者是 Label 本身是字符串的编码方式，如”High”,”Low”,”Medium”等，那么这个步骤就很有用，转换后的格式，才能被 Spark 更好的处理。

第二个，使用 VectorAssembler 从源数据中提取特征指标数据，这是一个比较典型且通用的步骤，因为我们的原始数据集里，经常会包含一些非指标数据，如 ID，Description 等。

第三个，创建一个随机森林分类器 RandomForestClassifier 实例，并设定相关参数，主要是告诉随机森林算法输入 DataFrame 数据里哪个列是特征向量，哪个是类别标识，并告诉随机森林分类器训练 5 棵独立的子树。

第四个，我们使用 IndexToString Transformer 去把之前的序列编码后的 Label 转化成原始的 Label，恢复之前的可读性比较高的 Label，这样不论是存储还是显示模型的测试结果，可读性都会比较高。

这几个 Stage 都会被用来构建 Pipeline 实例，并且会按照顺序执行，最终我们根据得到的 PipelineModel 实例，进一步调用其 transform 方法，去用训练好的模型预测测试数据集的分类。

清单 1. 示例程序源代码

#-*-coding:utf-8-*-
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.param import Param
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator




if __name__ == "__main__":
    # End-to-end example: train a 5-tree random forest on the banknote
    # authentication data set through an ML Pipeline, evaluate it on a
    # held-out split, then print and persist the trained forest.
    sc = SparkContext(appName="Classification with ML Pipeline")
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    # Step 1
    # Read the source data file and convert it to a DataFrame with named columns.
    # Every input line holds four float features followed by an integer label:
    #     3.6216,8.6661,-2.8073,-0.44699,0
    #     4.5459,8.1674,-2.4586,-1.4621,0
    #     3.866,-2.6383,1.9242,0.10645,0
    #     3.4566,9.5228,-4.0112,-3.5944,0
    #     0.32924,-4.4552,4.5718,-0.9888,0
    #     ... ...
    dataPath = '/tmp/test/data_banknote_authentication.txt'
    parsedRDD = sc.textFile(dataPath)
    rddData = parsedRDD.map(lambda x: x.split(',')).map(
        lambda x: (float(x[0]), float(x[1]), float(x[2]), float(x[3]), int(x[4])))
    # cache(): the DataFrame is scanned by the indexer fit and by both splits.
    df = spark.createDataFrame(rddData).toDF("f0", "f1", "f2", "f3", "label").cache()
    # Uncomment to preview the parsed data: df.show()
    # Sample output:
    #     +-------+-------+--------+--------+-----+
    #     | f0 | f1 | f2 | f3 | label |
    #     +-------+-------+--------+--------+-----+
    #     | 3.6216 | 8.6661 | -2.8073 | -0.44699 | 0 |
    #     | 4.5459 | 8.1674 | -2.4586 | -1.4621 | 0 |
    #     | 3.866 | -2.6383 | 1.9242 | 0.10645 | 0 |
    #     | 3.4566 | 9.5228 | -4.0112 | -3.5944 | 0 |
    #     | 0.32924 | -4.4552 | 4.5718 | -0.9888 | 0 |
    #     | 4.3684 | 9.6718 | -3.9606 | -3.1625 | 0 |
    #     | 3.5912 | 3.0129 | 0.72888 | 0.56421 | 0 |
    #     | 2.0922 | -6.81 | 8.4636 | -0.60216 | 0 |
    #     | 3.2032 | 5.7588 | -0.75345 | -0.61251 | 0 |
    #     | 1.5356 | 9.1772 | -2.2718 | -0.73535 | 0 |
    #     | 1.2247 | 8.7779 | -2.2135 | -0.80647 | 0 |
    #     | 3.9899 | -2.7066 | 2.3946 | 0.86291 | 0 |
    #     | 1.8993 | 7.6625 | 0.15394 | -3.1108 | 0 |
    #     | -1.5768 | 10.843 | 2.5462 | -2.9362 | 0 |
    #     | 3.404 | 8.7261 | -2.9915 | -0.57242 | 0 |
    #     | 4.6765 | -3.3895 | 3.4896 | 1.4771 | 0 |
    #     | 2.6719 | 3.0646 | 0.37158 | 0.58619 | 0 |
    #     | 0.80355 | 2.8473 | 4.3439 | 0.6017 | 0 |
    #     | 1.4479 | -4.8794 | 8.3428 | -2.1086 | 0 |
    #     | 5.2423 | 11.0272 | -4.353 | -4.1013 | 0 |
    #     +-------+-------+--------+--------+-----+

    # Step 2
    # StringIndexer encodes a column of labels into a column of label indices.
    # The indices are in [0, numLabels), ordered by label frequency, so raw
    # labels of any kind become values the Spark ML algorithms can consume.
    # It is fitted on the full data set so that every label gets an index and
    # the label list is available for IndexToString in step 5.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df)

    # Step 3
    # Define a VectorAssembler transformer that packs the source feature
    # columns into one vector column. This is helpful when the raw input also
    # contains non-feature columns such as "ID" or "Date".
    vectorAssembler = VectorAssembler(
        inputCols=["f0", "f1", "f2", "f3"], outputCol="featureVector")

    # Step 4
    # Create the RandomForestClassifier and set its input parameters; the
    # forest is trained with 5 trees. It learns from "indexedLabel" (the
    # StringIndexer output) so that its predictions are label indices, which
    # is what IndexToString in step 5 decodes.
    rfClassifier = RandomForestClassifier(
        featuresCol="featureVector", labelCol="indexedLabel", numTrees=5)

    # Step 5
    # Convert predicted label indices back to the original labels so that the
    # result is easy to read when it is displayed or saved to a file.
    indexToString = IndexToString(
        inputCol="prediction", outputCol="predictedLabel", labels=stringIndexer.labels)

    # Step 6
    # Randomly split the input data 8:2 - 80% for training, the rest for testing.
    trainData, testData = df.randomSplit([0.8, 0.2])

    # Step 7
    # Create an ML pipeline made of the 4 PipelineStage objects above, then
    # call fit to run the defined operations on the training data.
    pipeline = Pipeline(stages=[stringIndexer, vectorAssembler, rfClassifier, indexToString])
    model = pipeline.fit(trainData)

    # Step 8
    # Predict the testing data. transform returns a DataFrame with the
    # prediction columns appended to the input columns.
    predictionResultDF = model.transform(testData)

    # Step 9
    # Select features, label and predicted label from the DataFrame to display.
    # show() prints only 20 rows; it is just for reference. Uncomment to use:
    # predictionResultDF.select("f0","f1","f2","f3","label","predictedLabel").show()
    # Sample output:
    #     +--------+-------+-------+---------+-----+--------------+
    #     | f0 | f1 | f2 | f3 | label | predictedLabel |
    #     +--------+-------+-------+---------+-----+--------------+
    #     | -2.7419 | 11.4038 | 2.5394 | -5.5793 | 0 | 0 |
    #     | -2.6989 | 12.1984 | 0.67661 | -8.5482 | 0 | 0 |
    #     | -2.4604 | 12.7302 | 0.91738 | -7.6418 | 0 | 0 |
    #     | -2.0759 | 10.8223 | 2.6439 | -4.837 | 0 | 0 |
    #     | -1.9458 | 11.2217 | 1.9079 | -3.4405 | 0 | 0 |
    #     | -1.8584 | 7.886 | -1.6643 | -1.8384 | 0 | 0 |
    #     | -1.8584 | 7.886 | -1.6643 | -1.8384 | 0 | 0 |
    #     | -1.8411 | 10.8306 | 2.769 | -3.0901 | 0 | 0 |
    #     | -1.8348 | 11.0334 | 3.1863 | -4.8888 | 0 | 0 |
    #     | -1.4572 | 9.1214 | 1.7425 | -5.1241 | 0 | 0 |
    #     | -1.1391 | 1.8127 | 6.9144 | 0.70127 | 0 | 1 |
    #     | -1.1193 | 10.7271 | 2.0938 | -5.6504 | 0 | 0 |
    #     | -1.0401 | 9.3987 | 0.85998 | -5.3336 | 0 | 0 |
    #     | -0.96511 | 9.4111 | 1.7305 | -4.8629 | 0 | 0 |
    #     | -0.78689 | 9.5663 | -3.7867 | -7.5034 | 0 | 0 |
    #     | -0.16735 | 7.6274 | 1.2061 | -3.6241 | 0 | 0 |
    #     | -0.16682 | 5.8974 | 0.49839 | -0.70044 | 0 | 0 |
    #     | -0.11996 | 6.8741 | 0.91995 | -0.6694 | 0 | 0 |
    #     | -0.11783 | -1.5789 | 8.03 | -0.028031 | 0 | 0 |
    #     | 0.045304 | 6.7334 | 1.0708 | -0.9332 | 0 | 0 |
    #     +--------+-------+-------+---------+-----+--------------+

    # Step 10
    # Compute the prediction accuracy on the testing data. "prediction" holds
    # label indices, so it is compared against "indexedLabel", not the raw label.
    evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="indexedLabel", metricName="accuracy")
    predictionAccuracy = evaluator.evaluate(predictionResultDF)
    print("Testing Error=%f" % (1 - predictionAccuracy))
    # Example from one run: Testing Error = 0.039146

    # Step 11 (optional)
    # Print and save the model structure. stages[2] is the fitted random
    # forest because the classifier is the third stage of the pipeline.
    randomForestModel = model.stages[2]
    print("Trained Random Forest Model is:\n %s" % (randomForestModel.toDebugString))
    modelPath = "/tmp/test/randomForestModel"
    # overwrite(): a plain save() raises when modelPath already exists, which
    # would make every run after the first one fail at the very last step.
    randomForestModel.write().overwrite().save(modelPath)

运行示例程序

spark-submit --master yarn --executor-memory 6g ./pipelineRF.py

部分运行结果：

Trained Random Forest Model is:                                                 
 RandomForestClassificationModel (uid=RandomForestClassifier_42afa6b3e84c15b7747c) with 5 trees
  Tree 0 (weight 1.0):
    If (feature 0 <= 0.26517)
     If (feature 0 <= -0.33729)
      If (feature 1 <= 9.5663)
       If (feature 1 <= 7.2673)
        If (feature 1 <= -1.7837)
         Predict: 1.0
        Else (feature 1 > -1.7837)
         Predict: 1.0
       Else (feature 1 > 7.2673)
        If (feature 0 <= -4.6765)
         Predict: 1.0
        Else (feature 0 > -4.6765)
         Predict: 0.0
      Else (feature 1 > 9.5663)
       If (feature 0 <= -4.6765)
        Predict: 1.0
       Else (feature 0 > -4.6765)
        Predict: 0.0
     Else (feature 0 > -0.33729)
      If (feature 3 <= -0.67975)
       If (feature 1 <= 4.0537)
        Predict: 1.0
       Else (feature 1 > 4.0537)
        Predict: 0.0
      Else (feature 3 > -0.67975)
       If (feature 2 <= 2.4408)
        Predict: 1.0
       Else (feature 2 > 2.4408)
        Predict: 0.0
    Else (feature 0 > 0.26517)
     If (feature 1 <= 5.2684)
      If (feature 2 <= -1.4501)
       If (feature 2 <= -3.1749)
        Predict: 1.0
       Else (feature 2 > -3.1749)
        If (feature 3 <= 0.16076)
         Predict: 1.0
        Else (feature 3 > 0.16076)
         Predict: 1.0
      Else (feature 2 > -1.4501)
       If (feature 0 <= 0.7057)
        If (feature 2 <= 0.32274)
         Predict: 1.0
        Else (feature 2 > 0.32274)
         Predict: 0.0
       Else (feature 0 > 0.7057)
        Predict: 0.0
     Else (feature 1 > 5.2684)
      Predict: 0.0
  Tree 1 (weight 1.0):
    If (feature 0 <= 0.7057)
    ……

注意事项

本文的目标数据集结构其实并不复杂，之所以用 ML Pipeline 的方式实现训练和预测过程是为了向大家展示 ML Pipeline 的用法，这样的简单数据集也更有利于读者掌握 ML Pipeline 的处理过程。
ML Pipeline 提供了大量做特征数据提取和转换的工具，具体参考这里。
使用 Spark 解决机器学习问题，我们通常需要先了解相关算法的原理，然后学习 Spark 相关实现的可调参数，测试过程中，可以针对数据集特点多尝试几种算法，并多做模型的交叉验证。
本文所使用的数据集数据量很小，所以可能反映不了 Spark 处理大数据的优势。读者如果有更大的数据集，只需对本文程序做少许修改，便可以用在新的数据集上，以测试并了解更多的实现细节。

总结

本文向读者较为详细地介绍了 ML Pipeline 的基本概念和编程实现步骤，大家可以看到，较之 MLlib，ML Pipeline 在结构和逻辑层次上确实是更加清晰了。但是我认为 MLlib 对于处理结构相对简单的数据集其实依然具有优势，可能刚开始更容易被理解接受。另外从 Spark 的学习曲线上来讲，刚开始大家接触的都是 RDD，对 DataFrame 不甚了解，所以初学者其实更容易接受 MLlib 的方式。所以，应该说 MLlib 和 ML Pipeline 都有各自的优势吧。当然，这更多是我个人的理解。希望这篇文章可以对大家学习 ML Pipeline 有帮助，在阅读过程中，有任何不懂或者发现任何问题，请留下您的评论，我会第一时间回答，这样也是一个交流学习的过程，非常感谢。
相关主题

参考 Spark ML Pipeline 官方文档，了解基本理论和方法。
参考 Spark 官方 API 文档，了解相关 API 的用法。
developerWorks 开源技术主题：查找丰富的操作信息、工具和项目更新，帮助您掌握开源技术并将其用于 IBM 产品。

关于pipeline参数调优可参考：http://spark.apache.org/docs/latest/ml-tuning.html

Spark MLlib中的机器学习

逻辑回归算法原理及Spark MLlib调用实例

逻辑回归是一个流行的二分类问题预测方法。
二分类逻辑回归可以扩展为多分类逻辑回归来训练和预测多类别分类问题。如一个分类问题有 K 种可能结果，我们可以选取其中一种结果作为“中心点”，其他 K－1 个结果分别视为中心点结果的对立点。在 spark.mllib 中，取第一个类别为中心点类别。

注意：早期版本（如 Spark 1.6）的 spark.ml 逻辑回归工具仅支持二分类问题；较新的版本（如本文前面使用的 Spark 2.2.1）已同时支持二项逻辑回归和多项（softmax）逻辑回归。
当不拟合截距项、且训练数据中含有常数非零列时，Spark MLlib 训练出的 LogisticRegressionModel 会为这些常数非零列输出零系数。这种处理方式与 R 的 glmnet 相同，但不同于 LIBSVM。

参数：

elasticNetParam：
类型：双精度型。
含义：弹性网络混合参数，范围[0,1]。
featuresCol:
类型：字符串型。
含义：特征列名。
fitIntercept:
类型：布尔型。
含义：是否拟合截距项（intercept）。
labelCol:
类型：字符串型。
含义：标签列名。
maxIter:
类型：整数型。
含义：最多迭代次数（>=0）。
predictionCol:
类型：字符串型。
含义：预测结果列名。
probabilityCol:
类型：字符串型。
含义：用以预测类别条件概率的列名。
regParam:
类型：双精度型。
含义：正则化参数（>=0）。
standardization:
类型：布尔型。
含义：训练模型前是否需要对训练特征进行标准化处理。
threshold:
类型：双精度型。
含义：二分类预测的阈值，范围[0,1]。
thresholds:
类型：双精度数组型。
含义：多分类预测的阈值，用于调整预测为各个类别的概率。
tol:
类型：双精度型。
含义：迭代算法的收敛容差（tolerance）。
weightCol:
类型：字符串型。
含义：权重列名。

例子

例子1

from pyspark.ml.classification import LogisticRegression  

# Read the LIBSVM-format sample file as the training DataFrame.
# NOTE(review): `spark` is not defined in this snippet - presumably the
# SparkSession provided by an interactive pyspark shell; confirm before running.
training = spark.read.load("data/mllib/sample_libsvm_data.txt", format="libsvm")

# Logistic regression estimator: at most 10 iterations, regularization
# parameter 0.3, elastic-net mixing parameter 0.8.
lr = LogisticRegression(regParam=0.3, elasticNetParam=0.8, maxIter=10)

# Train on the whole sample.
lrModel = lr.fit(training)

# Report the fitted coefficients and intercept.
coefficientsText = str(lrModel.coefficients)
interceptText = str(lrModel.intercept)
print("Coefficients: " + coefficientsText)
print("Intercept: " + interceptText)

例子2

# -*- coding: utf-8 -*-
#spark1.6.3版本
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    """Convert one whitespace-separated text line into a LabeledPoint.

    The first number on the line is used as the label and the remaining
    numbers as the feature values.

    Raises ValueError if a token is not a number and IndexError if the line
    holds no tokens at all.
    """
    # split() without an argument tolerates tabs, repeated spaces and
    # leading/trailing whitespace; split(' ') turns those into empty tokens
    # that float() rejects.
    values = [float(x) for x in line.split()]
    return LabeledPoint(values[0], values[1:])

def trainingError(classifier, points):
    """Return the fraction of LabeledPoints in `points` that `classifier` mispredicts.

    `points` is an RDD of LabeledPoint; `classifier` needs a predict(features)
    method. Raises ZeroDivisionError when `points` is empty.
    """
    labelsAndPreds = points.map(lambda p: (p.label, classifier.predict(p.features)))
    mispredicted = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count()
    return mispredicted / float(points.count())


sc = SparkContext()
data = sc.textFile("/tmp/test/sample_svm_data.txt")  # the file is stored on HDFS
# Debug aid: collect() pulls the whole file to the driver, so this is only
# acceptable for a small sample file like this one.
print(data.collect())
# Convert the RDD of text lines into an RDD of LabeledPoint.
# cache(): training and both evaluations below traverse this RDD repeatedly;
# without it the file is re-read and re-parsed for every action.
parsedData = data.map(parsePoint).cache()

# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
trainErr = trainingError(model, parsedData)
print("Training Error = " + str(trainErr))

# Save and load model (stored on HDFS).
# save() cannot overwrite an existing HDFS path, so remove the old model first:
#   hadoop fs -rm -r /tmp/test/pythonLogisticRegressionWithLBFGSModel
modelPath = "/tmp/test/pythonLogisticRegressionWithLBFGSModel"
model.save(sc, modelPath)
sameModel = LogisticRegressionModel.load(sc, modelPath)
# The reloaded model is expected to reproduce the same training error.
trainErr2 = trainingError(sameModel, parsedData)
print("Training Error = " + str(trainErr2))

保存和加载模型

上面的例子中展示了保存训练好的模型至hdfs及从hdfs加载模型的过程，mllib自带的很多算法都有保存和加载的接口。
但是要注意，如果hdfs上已经存在了同名的模型，那么保存会失败（离线训练会不断地更新模型）；此时我们要先删除hdfs上的模型，具体如何在python中操作hdfs文件系统可参考另一篇文章。

其它资料
Machine Learning Library (MLlib) Guide
mllib分类器编程接口
 Spark机器学习库（MLlib）官方指南手册中文版
 Spark MLlib — Word2Vec
MLlib里几个简单的分类模型(python)
PySpark 学习笔记四:Introducing MLlib
Spark MLlib Statistics统计

数据挖掘工具---spark使用练习---ml(二)

模型训练

评估器

线性回归

逻辑回归

支持向量机

决策树

梯度提升决策树模型

随机森林

OneVsRest

pyspark.ml.classification.NaiveBayes

pyspark.ml.classification.MultilayerPerceptronClassifier

模型调优

朴素贝叶斯

逻辑回归

评估:pyspark.ml.evaluation module

调优:pyspark.ml.tuning module

使用 ML Pipeline 构建机器学习工作流

引言

关于 ML Pipeline

随机森林及 ML 的实现

目标数据集预览

案例分析与编码实现

运行示例程序

注意事项

总结

Spark MLlib中的机器学习

逻辑回归算法原理及Spark MLlib调用实例

参数：

例子

保存和加载模型

猜你喜欢