pyspark --- 归一化&Label编码

以下代码块可直接调用(运行前需已创建 SparkSession 对象 `ss`,并定义好模型保存目录前缀 `DICT_HDFSPATH`)。

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import MinMaxScaler as MinMaxScalerSpark, VectorAssembler
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.sql import functions as fn
from pyspark.sql.types import DoubleType

Label编码代码

# Label encoding
def labelEncodeDF(df, inputColumn, outputColumn, savePath, flag=True):
    '''
    Label-encode one column with a StringIndexer fitted on ``df``.

    The indexer is configured with ``handleInvalid="keep"``, so values that
    were not present when fitting are kept (mapped to one extra index)
    instead of raising during ``transform``.

    :param df: DataFrame containing the column to encode
    :param inputColumn: name of the column to encode
    :param outputColumn: name of the encoded column to add
    :param savePath: path where the fitted StringIndexerModel is written
    :param flag: when truthy, save the fitted model to ``savePath``,
        overwriting whatever is already stored there
    :return: ``df`` with ``outputColumn`` appended
    '''
    indexer = StringIndexer(inputCol=inputColumn, outputCol=outputColumn)
    indexer = indexer.setHandleInvalid("keep")
    fitted = indexer.fit(df)

    # Persist the fitted model so it can be reloaded later with
    # StringIndexerModel.load(savePath).
    if flag:
        fitted.write().overwrite().save(savePath)

    return fitted.transform(df)
# Demo: fit and save the encoder on a small frame, then reload it and apply
# it to frames that contain both known and previously unseen values.
# ``ss`` (SparkSession) and ``DICT_HDFSPATH`` must already be defined.
encoder_path = DICT_HDFSPATH + 'user_modelMake.encode'
columns = ["id", "value"]

df = ss.createDataFrame(
    [(2, "iphone"), (11, "小米"), (22, "huawei"), (33, "a锤子"), (66, "小米"), (50, "iphone")],
    columns,
)
df.show()
df = labelEncodeDF(df, 'value', 'valueEncode', encoder_path, True)
df.show()

# One known value, two unseen values, and a mix of both.
df1 = ss.createDataFrame([(12, "iphone")], columns)
df2 = ss.createDataFrame([(22, "鸿蒙")], columns)
df3 = ss.createDataFrame([(22, "鸿蒙hos123")], columns)
df4 = ss.createDataFrame(
    [(22, "鸿蒙hos123"), (22, "鸿蒙hos"), (12, "iphone"), (11, "小米")],
    columns,
)

labelmodel = StringIndexerModel.load(encoder_path)
for frame in (df1, df2, df3, df4):
    labelmodel.transform(frame).show()
+---+------+
| id| value|
+---+------+
|  2|iphone|
| 11|  小米|
| 22|huawei|
| 33| a锤子|
| 66|  小米|
| 50|iphone|
+---+------+

+---+------+-----------+
| id| value|valueEncode|
+---+------+-----------+
|  2|iphone|        1.0|
| 11|  小米|        0.0|
| 22|huawei|        2.0|
| 33| a锤子|        3.0|
| 66|  小米|        0.0|
| 50|iphone|        1.0|
+---+------+-----------+

+---+------+-----------+
| id| value|valueEncode|
+---+------+-----------+
| 12|iphone|        1.0|
+---+------+-----------+

+---+-----+-----------+
| id|value|valueEncode|
+---+-----+-----------+
| 22| 鸿蒙|        4.0|
+---+-----+-----------+

+---+----------+-----------+
| id|     value|valueEncode|
+---+----------+-----------+
| 22|鸿蒙hos123|        4.0|
+---+----------+-----------+

+---+----------+-----------+
| id|     value|valueEncode|
+---+----------+-----------+
| 22|鸿蒙hos123|        4.0|
| 22|   鸿蒙hos|        4.0|
| 12|    iphone|        1.0|
| 11|      小米|        0.0|
+---+----------+-----------+

归一化代码

def normalizedDF(df, inputColumn, outputColumn, savePath, minScope=0.0, maxScope=1.0, flag=True):
    '''
    Min-max scale a single numeric column into ``[minScope, maxScope]``.

    A two-stage pipeline (VectorAssembler -> MinMaxScaler) is fitted on
    ``df``. The helper vector column ``inputColumn + '_vec'`` is left in the
    result. The pipeline itself emits ``outputColumn`` as a one-element
    vector; this function unwraps it into a double column. A pipeline
    reloaded from ``savePath`` still yields the vector form.

    :param df: DataFrame containing the column to scale
    :param inputColumn: name of the column to scale
    :param outputColumn: name of the scaled column to add
    :param savePath: path where the fitted PipelineModel is written
    :param minScope: lower bound of the target range
    :param maxScope: upper bound of the target range, default range [0, 1]
    :param flag: when truthy, save the fitted pipeline to ``savePath``,
        overwriting whatever is already stored there
    :return: ``df`` plus the helper vector column and ``outputColumn``
    '''
    vectorColumn = inputColumn + '_vec'

    # MinMaxScaler works on vector columns, so wrap the scalar column first.
    assembler = VectorAssembler(handleInvalid="keep").setInputCols([inputColumn]).setOutputCol(vectorColumn)
    scaler = MinMaxScalerSpark(min=minScope, max=maxScope, inputCol=vectorColumn, outputCol=outputColumn)
    model = Pipeline(stages=[assembler, scaler]).fit(df)
    scaledDF = model.transform(df)

    # Declare DoubleType explicitly: fn.udf defaults to StringType, which
    # would turn the scaled values into strings.
    firstElement = fn.udf(lambda vec: float(vec[0]), DoubleType())
    scaledDF = scaledDF.withColumn(outputColumn, firstElement(fn.col(outputColumn)))

    if flag:
        model.write().overwrite().save(savePath)
    return scaledDF
# Demo: fit and save the scaler on a small frame, then reload it and apply
# it to new frames. ``ss`` (SparkSession) and ``DICT_HDFSPATH`` must already
# be defined.
df = ss.createDataFrame([(2, 98), (11, 100), (22, 3), (33, 1), (66, 60), (50, 20)], ["id", "value"])
df.show()

df = normalizedDF(df, 'value', 'value_nor', DICT_HDFSPATH + 'value.normalized', 0.0, 10.0, True)
df.show()
mmmodel = PipelineModel.load(DICT_HDFSPATH + 'value.normalized')
# The reloaded pipeline emits 'value_nor' as a one-element vector; unwrap it.
# DoubleType is declared explicitly because fn.udf defaults to StringType.
udf_mm = fn.udf(lambda x: float(x[0]), DoubleType())

df1 = ss.createDataFrame([
    (12, 30),
], ["id", "value"])
df1.show()

# 300 lies above the fitted maximum (100), so it scales beyond the 0-10 range.
df2 = ss.createDataFrame([
    (12, 300),
    (12, 30),
    (12, 12),
    (12, 8),
], ["id", "value"])
df2.show()

df1 = mmmodel.transform(df1)
df1.withColumn('value_nor', udf_mm(fn.col('value_nor'))).show()
df2 = mmmodel.transform(df2)
df2.withColumn('value_nor', udf_mm(fn.col('value_nor'))).show()
+---+-----+
| id|value|
+---+-----+
|  2|   98|
| 11|  100|
| 22|    3|
| 33|    1|
| 66|   60|
| 50|   20|
+---+-----+

+---+-----+---------+-------------------+
| id|value|value_vec|          value_nor|
+---+-----+---------+-------------------+
|  2|   98|   [98.0]|  9.797979797979798|
| 11|  100|  [100.0]|               10.0|
| 22|    3|    [3.0]|0.20202020202020204|
| 33|    1|    [1.0]|                0.0|
| 66|   60|   [60.0]|  5.959595959595959|
| 50|   20|   [20.0]| 1.9191919191919191|
+---+-----+---------+-------------------+

+---+-----+
| id|value|
+---+-----+
| 12|   30|
+---+-----+

+---+-----+
| id|value|
+---+-----+
| 12|  300|
| 12|   30|
| 12|   12|
| 12|    8|
+---+-----+

+---+-----+---------+-----------------+
| id|value|value_vec|        value_nor|
+---+-----+---------+-----------------+
| 12|   30|   [30.0]|2.929292929292929|
+---+-----+---------+-----------------+

+---+-----+---------+------------------+
| id|value|value_vec|         value_nor|
+---+-----+---------+------------------+
| 12|  300|  [300.0]|30.202020202020204|
| 12|   30|   [30.0]| 2.929292929292929|
| 12|   12|   [12.0]|1.1111111111111112|
| 12|    8|    [8.0]|0.7070707070707071|
+---+-----+---------+------------------+

猜你喜欢

转载自blog.csdn.net/qq_42363032/article/details/120267930
今日推荐