以下代码块直接调用即可
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import MinMaxScaler as MinMaxScalerSpark, VectorAssembler
Label编码代码
# Label encoding
def labelEncodeDF(df, inputColumn, outputColumn, savePath, flag=True):
    """
    Label-encode a string column using Spark's StringIndexer.

    Labels unseen at transform time are kept (handleInvalid="keep"),
    so they map to one extra shared index instead of raising an error.

    :param df: input DataFrame
    :param inputColumn: name of the string column to encode
    :param outputColumn: name of the encoded output column
    :param savePath: path used to persist the fitted encoder
    :param flag: when True, save the fitted StringIndexerModel to savePath
    :return: DataFrame with the encoded column appended
    """
    indexer = StringIndexer(
        inputCol=inputColumn,
        outputCol=outputColumn,
        handleInvalid="keep",
    )
    fitted = indexer.fit(df)
    encoded = fitted.transform(df)
    if flag:
        fitted.write().overwrite().save(savePath)
    return encoded
# Demo: fit a label encoder on a small device-name DataFrame and persist it.
# NOTE(review): `ss` (a SparkSession) and `DICT_HDFSPATH` are assumed to be
# defined elsewhere in this file — confirm before running standalone.
df = ss.createDataFrame([(2, "iphone"), (11, "小米"), (22, "huawei"), (33, "a锤子"), (66, "小米"), (50, "iphone")], ["id", "value"])
df.show()
# Encode the `value` column and save the fitted encoder to HDFS (flag=True).
df = labelEncodeDF(df, 'value', 'valueEncode', DICT_HDFSPATH + 'user_modelMake.encode', True)
df.show()
# Demo: reload the persisted encoder and probe how it handles new data.
# Single row with a label seen during fitting ("iphone").
df1 = ss.createDataFrame([
    (12, "iphone"),
], ["id", "value"])
# Single row with a label NOT seen during fitting.
df2 = ss.createDataFrame([
    (22, "鸿蒙"),
], ["id", "value"])
# Another unseen label, different string.
df3 = ss.createDataFrame([
    (22, "鸿蒙hos123"),
], ["id", "value"])
# Mix of unseen and seen labels in one frame.
df4 = ss.createDataFrame([
    (22, "鸿蒙hos123"),
    (22, "鸿蒙hos"),
    (12, "iphone"),
    (11, "小米")
], ["id", "value"])
labelmodel = StringIndexerModel.load(DICT_HDFSPATH + 'user_modelMake.encode')
# Per the output below, every unseen label collapses to the same extra
# index (4.0) because the encoder was fit with handleInvalid="keep".
labelmodel.transform(df1).show()
labelmodel.transform(df2).show()
labelmodel.transform(df3).show()
labelmodel.transform(df4).show()
+---+------+
| id| value|
+---+------+
| 2|iphone|
| 11| 小米|
| 22|huawei|
| 33| a锤子|
| 66| 小米|
| 50|iphone|
+---+------+
+---+------+-----------+
| id| value|valueEncode|
+---+------+-----------+
| 2|iphone| 1.0|
| 11| 小米| 0.0|
| 22|huawei| 2.0|
| 33| a锤子| 3.0|
| 66| 小米| 0.0|
| 50|iphone| 1.0|
+---+------+-----------+
+---+------+-----------+
| id| value|valueEncode|
+---+------+-----------+
| 12|iphone| 1.0|
+---+------+-----------+
+---+-----+-----------+
| id|value|valueEncode|
+---+-----+-----------+
| 22| 鸿蒙| 4.0|
+---+-----+-----------+
+---+----------+-----------+
| id| value|valueEncode|
+---+----------+-----------+
| 22|鸿蒙hos123| 4.0|
+---+----------+-----------+
+---+----------+-----------+
| id| value|valueEncode|
+---+----------+-----------+
| 22|鸿蒙hos123| 4.0|
| 22| 鸿蒙hos| 4.0|
| 12| iphone| 1.0|
| 11| 小米| 0.0|
+---+----------+-----------+
归一化代码
def normalizedDF(df, inputColumn, outputColumn, savePath, minScope=0.0, maxScope=1.0, flag=True):
    """
    Min-max normalize a numeric column into [minScope, maxScope].

    Builds a VectorAssembler + MinMaxScaler pipeline, fits it on `df`,
    then unwraps the resulting one-element vector back into a scalar.

    :param df: input DataFrame
    :param inputColumn: name of the numeric column to normalize
    :param outputColumn: name of the normalized output column
    :param savePath: path used to persist the fitted pipeline
    :param minScope: lower bound of the normalized range (default 0.0)
    :param maxScope: upper bound of the normalized range (default 1.0)
    :param flag: when True, save the fitted PipelineModel to savePath
    :return: DataFrame with `<inputColumn>_vec` and `outputColumn` appended
    """
    # MinMaxScaler only accepts vector columns, so assemble the scalar first.
    vacal = VectorAssembler(handleInvalid="keep").setInputCols([inputColumn]).setOutputCol(inputColumn + '_vec')
    mmscala = MinMaxScalerSpark(min=minScope, max=maxScope, inputCol=inputColumn + '_vec', outputCol=outputColumn)
    pipeline = Pipeline(stages=[vacal, mmscala])
    model = pipeline.fit(df)
    mmsdf = model.transform(df)
    # BUG FIX: fn.udf without returnType defaults to StringType, which would
    # silently stringify the normalized values; declare 'double' so the
    # output column stays numeric.
    udf_mm = fn.udf(lambda x: float(x[0]), 'double')
    mmsdf = mmsdf.withColumn(outputColumn, udf_mm(fn.col(outputColumn)))
    if flag:
        model.write().overwrite().save(savePath)
    return mmsdf
# Demo: fit a min-max scaler on a numeric column, persist it, then reload
# it and apply it to new data (including values outside the fitted range,
# which extrapolate beyond [0, 10] as the output below shows).
df = ss.createDataFrame([(2, 98), (11, 100), (22, 3), (33, 1), (66, 60), (50, 20)], ["id", "value"])
df.show()
df = normalizedDF(df, 'value', 'value_nor', DICT_HDFSPATH + 'value.normalized', 0.0, 10.0, True)
df.show()
mmmodel = PipelineModel.load(DICT_HDFSPATH + 'value.normalized')
# BUG FIX: declare 'double' as the udf return type — the default is
# StringType, which would turn the normalized column into strings.
udf_mm = fn.udf(lambda x: float(x[0]), 'double')
# Value inside the fitted range.
df1 = ss.createDataFrame([
    (12, 30),
], ["id", "value"])
df1.show()
# Mix of in-range values and one (300) beyond the fitted max.
df2 = ss.createDataFrame([
    (12, 300),
    (12, 30),
    (12, 12),
    (12, 8),
], ["id", "value"])
df2.show()
df1 = mmmodel.transform(df1)
df1.withColumn('value_nor', udf_mm(fn.col('value_nor'))).show()
df2 = mmmodel.transform(df2)
df2.withColumn('value_nor', udf_mm(fn.col('value_nor'))).show()
+---+-----+
| id|value|
+---+-----+
| 2| 98|
| 11| 100|
| 22| 3|
| 33| 1|
| 66| 60|
| 50| 20|
+---+-----+
+---+-----+---------+-------------------+
| id|value|value_vec| value_nor|
+---+-----+---------+-------------------+
| 2| 98| [98.0]| 9.797979797979798|
| 11| 100| [100.0]| 10.0|
| 22| 3| [3.0]|0.20202020202020204|
| 33| 1| [1.0]| 0.0|
| 66| 60| [60.0]| 5.959595959595959|
| 50| 20| [20.0]| 1.9191919191919191|
+---+-----+---------+-------------------+
+---+-----+
| id|value|
+---+-----+
| 12| 30|
+---+-----+
+---+-----+
| id|value|
+---+-----+
| 12| 300|
| 12| 30|
| 12| 12|
| 12| 8|
+---+-----+
+---+-----+---------+-----------------+
| id|value|value_vec| value_nor|
+---+-----+---------+-----------------+
| 12| 30| [30.0]|2.929292929292929|
+---+-----+---------+-----------------+
+---+-----+---------+------------------+
| id|value|value_vec| value_nor|
+---+-----+---------+------------------+
| 12| 300| [300.0]|30.202020202020204|
| 12| 30| [30.0]| 2.929292929292929|
| 12| 12| [12.0]|1.1111111111111112|
| 12| 8| [8.0]|0.7070707070707071|
+---+-----+---------+------------------+