需求理解
需求:修改suuid编码方式
- 如上图所示,根据曝光数、点击数二次降序,以曝光数为分组,来进行区间编码(分箱)
- 某一组曝光数下,理想状态有3种情况:点击数>=3、点击数<3&≠0、点击数=0
- 模型从学习个性化用户行为,变为学习某一类用户行为,以此来解决ML中对suuid编码后量纲过大的影响。
受外在影响只能做ML,所以忽略emb;
hash会存在hash冲突,所以排除;
举例理解
'''
pvs clicks section_map bins
10 8 0 1
10 5 0 1
10 3 0 1
10 2 1 2
10 1 1 2
10 0 2 3
5 3 3 4
5 0 5 5
2 0 8 6
'''
曝光数:pvs;
点击数:clicks;
- 设某组pvs的分组索引为n(从0开始),则理想情况(每组pvs都有3种点击情况)下,该组pvs对应的区间为:【3n 3n+1 3n+2】
- n为pvs分组后索引;区间符合【3n 3n+1 3n+2】;
- 点击数∈[3,+∞)设为0;点击数∈(0,3)设为1;点击数=0设为2
- pvs = 10 n=0; 区间 【0 1 2】; clicks 转为012 是【0 0 0 1 1 2】; 对应的区间是 【0 0 0 1 1 2】
- pvs = 5 n=1; 区间 【3 4 5】; clicks 转为012 是【0 2】; 对应的区间是 【3 5】
- pvs = 2 n=2; 区间 【6 7 8】; clicks 转为012 是【2】; 对应的区间是 【8】
- 即上述示例的 section_map
- 将理想情况下的区间数变为连续不断开,即为最后的分箱区间数 bins
- 根据曝光数将分箱数和suuid做映射,即可得到suuid分区间编码数。
假数据
# 曝光降序、点击降序、等宽分区间
# Fake click log: one (suuid, y labels in row order) entry per user; every row
# shares the same oaid.
_fake_labels = [
    ('DONEW1', [0] * 8 + [1] * 6),
    ('2DONEW2', [0] * 6 + [1] * 4),
    ('3DONEW3', [0] * 6 + [1] * 2),
    ('4DONEW4', [0] * 2),
    ('5DONEW5', [0] * 2 + [1]),
    ('6DONEW6', [1, 0, 0, 1]),
]
di = [
    {'suuid': suuid, 'oaid': '000-12', 'y': y}
    for suuid, labels in _fake_labels
    for y in labels
]
df = ss.createDataFrame(di)
df.show()
+------+-------+---+
| oaid| suuid| y|
+------+-------+---+
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 0|
|000-12| DONEW1| 1|
|000-12| DONEW1| 1|
|000-12| DONEW1| 1|
|000-12| DONEW1| 1|
|000-12| DONEW1| 1|
|000-12| DONEW1| 1|
|000-12|2DONEW2| 0|
|000-12|2DONEW2| 0|
|000-12|2DONEW2| 0|
|000-12|2DONEW2| 0|
|000-12|2DONEW2| 0|
|000-12|2DONEW2| 0|
+------+-------+---+
only showing top 20 rows
示例代码
'''
pvs clicks n section section_map bins
8 6 0 [0 1 2] 0 1
6 4 1 [3 4 5] 3 2
6 2 1 [3 4 5] 4 3
2 2 2 [6 7 8] 7 4
2 1 2 [6 7 8] 7 4
2 0 2 [6 7 8] 8 5
'''
- pvs=8 n=0; 区间 【0 1 2】; clicks 转为012 是【0】; 对应的区间是 【0】
- pvs=6 n=1; 区间 【3 4 5】; clicks 转为012 是【0 1】; 对应的区间是 【3 4】
- pvs=2 n=2; 区间 【6 7 8】; clicks 转为012 是【1 1 2】; 对应的区间是 【7 7 8】
''' 针对suuid的区间编码(基于曝光数、点击数)'''
def _split_section_row(row):
    """Expand one grouped ``(pvs, 'c1,c2', 's1,s2')`` row into integer
    ``(pvs, clicks, section_map)`` tuples, pairing the i-th click count with
    the i-th section value of that same row."""
    pvs, clicks_csv, section_csv = row[0], row[1], row[2]
    return [
        (pvs, int(c), int(s))
        for c, s in zip(clicks_csv.split(','), section_csv.split(','))
    ]


def section_encode(df, uid, path=''):
    """Encode each ``uid`` into a bin number derived from its pvs / clicks.

    Steps:
      1. Aggregate the ``y`` column per ``uid`` into ``pvs`` and ``clicks``
         (via ``row_count``).
      2. Give every distinct ``pvs`` value an index ``n`` (descending pvs).
      3. Map each (pvs, clicks) to ``section_map`` (via ``row_section_map``).
      4. Re-index the distinct ``section_map`` values into consecutive
         ``bins`` starting at 1.

    Args:
        df: Spark DataFrame containing the ``uid`` column and a ``y`` column.
        uid: name of the id column to encode.
        path: unused; kept so existing callers keep working.

    Returns:
        ``(dfuid, uid_bin)``: a DataFrame with columns ``[uid, 'bins']``
        (``bins`` is an integer column) and a dict ``{uid value: bins}``
        collected on the driver.

    The intermediate ``show`` / ``print`` calls are debugging output.
    """
    # Per-uid pvs and clicks.
    dfuid = (
        df.groupBy(uid).agg(fn.collect_list('y').alias('y'))
        .rdd.map(row_count).toDF(schema=[uid, 'pvs', 'clicks'])
    )
    # Sort for display; later distributed transformations may reorder rows.
    dfuid = dfuid.orderBy(['pvs', 'clicks'], ascending=[0, 0])
    print('----------show 1----------')
    dfuid.show()

    # 1. Add n: the index of each distinct pvs value, largest pvs first.
    dfuid = add_correspond_index(df=dfuid, name='pvs', tmpname='n', ascending=0)
    dfuid = dfuid.orderBy(['pvs', 'clicks'], ascending=[0, 0])
    print('----------show 2----------')
    dfuid.show()

    # 2. Compute section_map per pvs group.
    dfsection = (
        dfuid.groupBy('pvs')
        .agg(fn.collect_list('clicks').alias('clicks'),
             fn.collect_list('n').alias('n'))
        .rdd.map(row_section_map)
        .toDF(schema=['pvs', 'clicks', 'section_map'])
    )
    print('----------show 3----------')
    dfsection.show(truncate=False)

    # Expand the comma-joined lists back to one row per (clicks, section_map).
    # Both values are taken from the same grouped row in a single pass, so
    # they cannot be mis-paired, and they are parsed to int so that the
    # ordering below is numeric rather than lexicographic ('10' < '3').
    dfsection = (
        dfsection.rdd.flatMap(_split_section_row)
        .toDF(schema=['pvs', 'clicks', 'section_map'])
        .orderBy(['section_map'], ascending=[1])
    )
    print('----------show 4----------')
    dfsection.show()

    # 3. bins: consecutive 1-based index over the distinct section_map values.
    dfsection = add_correspond_index(df=dfsection, name='section_map', tmpname='bins', ascending=1)
    dfsection = dfsection.withColumn('bins', fn.col('bins') + 1)
    print('----------show 5----------')
    dfsection.show()

    # Keep {"pvs_clicks": bins} on the driver and look it up in a UDF instead
    # of joining dfsection back onto dfuid.
    pc_bin = {
        '{}_{}'.format(row[0], row[1]): row[2]
        for row in dfsection.select(['pvs', 'clicks', 'bins']).collect()
    }
    print(pc_bin)
    lookup_bin = fn.udf(
        lambda pvs, clicks: pc_bin['{}_{}'.format(pvs, clicks)],
        IntegerType(),
    )
    dfuid = dfuid.withColumn('bins', lookup_bin(fn.col('pvs'), fn.col('clicks')))
    print('----------show 6----------')
    dfuid.orderBy(['pvs', 'clicks'], ascending=[0, 0]).show()
    dfuid = dfuid.drop('pvs', 'clicks', 'n')

    # Collect {uid: bins} for callers that prefer a broadcast lookup.
    uid_bin = {row[0]: row[1] for row in dfuid.select([uid, 'bins']).collect()}
    print(uid_bin)
    return dfuid, uid_bin
# Encode every suuid: dfbins holds (suuid, bins), uid_bin is the same mapping as a dict.
dfbins, uid_bin = section_encode(df=df, uid='suuid')
dfbins.show()

# Option 1: attach the bins with a left join (the join shuffles, which can lead to data skew).
df = df.join(dfbins, on=['suuid'], how='left')

# Option 2: broadcast the dict and look it up inside a UDF, which avoids that shuffle.
uid_bin = sc.broadcast(uid_bin)
lookup_bin = fn.udf(lambda suuid: uid_bin.value[suuid])
df = df.withColumn('bins_', lookup_bin(fn.col('suuid')))
----------show 1----------
+-------+---+------+
| suuid|pvs|clicks|
+-------+---+------+
| DONEW1| 8| 6|
|2DONEW2| 6| 4|
|3DONEW3| 6| 2|
|6DONEW6| 2| 2|
|5DONEW5| 2| 1|
|4DONEW4| 2| 0|
+-------+---+------+
----------show 2----------
+---+-------+------+---+
|pvs| suuid|clicks| n|
+---+-------+------+---+
| 8| DONEW1| 6| 0|
| 6|2DONEW2| 4| 1|
| 6|3DONEW3| 2| 1|
| 2|6DONEW6| 2| 2|
| 2|5DONEW5| 1| 2|
| 2|4DONEW4| 0| 2|
+---+-------+------+---+
----------show 3----------
+---+------+-----------+
|pvs|clicks|section_map|
+---+------+-----------+
|6 |4,2 |3,4 |
|8 |6 |0 |
|2 |2,1,0 |7,7,8 |
+---+------+-----------+
----------show 4----------
+---+------+-----------+
|pvs|clicks|section_map|
+---+------+-----------+
| 8| 6| 0|
| 6| 4| 3|
| 6| 2| 4|
| 2| 2| 7|
| 2| 1| 7|
| 2| 0| 8|
+---+------+-----------+
----------show 5----------
+-----------+---+------+----+
|section_map|pvs|clicks|bins|
+-----------+---+------+----+
| 0| 8| 6| 1|
| 3| 6| 4| 2|
| 4| 6| 2| 3|
| 7| 2| 2| 4|
| 7| 2| 1| 4|
| 8| 2| 0| 5|
+-----------+---+------+----+
{'8_6': '1', '6_4': '2', '6_2': '3', '2_2': '4', '2_1': '4', '2_0': '5'}
----------show 6----------
+---+-------+------+---+----+
|pvs| suuid|clicks| n|bins|
+---+-------+------+---+----+
| 8| DONEW1| 6| 0| 1|
| 6|2DONEW2| 4| 1| 2|
| 6|3DONEW3| 2| 1| 3|
| 2|6DONEW6| 2| 2| 4|
| 2|5DONEW5| 1| 2| 4|
| 2|4DONEW4| 0| 2| 5|
+---+-------+------+---+----+
{'DONEW1': 1, '2DONEW2': 2, '3DONEW3': 3, '6DONEW6': 4, '5DONEW5': 4, '4DONEW4': 5}
+-------+----+
| suuid|bins|
+-------+----+
| DONEW1| 1|
|2DONEW2| 2|
|3DONEW3| 3|
|6DONEW6| 4|
|5DONEW5| 4|
|4DONEW4| 5|
+-------+----+
# suuid的曝光数、点击数
def row_count(row):
    """Turn one ``(uid, labels)`` row into ``(uid, pvs, clicks)``.

    ``row[1]`` is the list of ``y`` labels collected for the uid.
    ``clicks`` is the sum of the labels; ``pvs`` is ``len(labels) - clicks``,
    i.e. the number of ``y == 0`` entries when the labels are 0/1.
    """
    uid, labels = row[0], row[1]
    clicks = sum(labels)
    pvs = len(labels) - clicks
    return uid, pvs, clicks
# 增加对应的自增索引
def add_correspond_index(df, name, tmpname, ascending=0):
    """Attach to ``df`` a column ``tmpname`` holding the index of each row's
    ``name`` value among the distinct ``name`` values.

    The distinct values are sorted by ``name`` (``ascending`` is passed to
    ``orderBy``), indexed by ``mkdf_tojoin``, and left-joined back onto ``df``.
    The result is ordered by ``tmpname`` ascending.
    """
    distinct_values = df.select(name).drop_duplicates(subset=[name])
    ordered_values = distinct_values.orderBy([name], ascending=[ascending])
    indexed_values = mkdf_tojoin(ordered_values, tmpname)
    joined = df.join(indexed_values, on=[name], how='left')
    return joined.orderBy([tmpname], ascending=[1])
# section_map
def row_section_map(row, threshold=3):
    """Map the click counts of one pvs group onto its section ``[3n, 3n+1, 3n+2]``.

    Args:
        row: ``(pvs, clicks, n_list)`` where ``clicks`` is the list of click
            counts in the group and ``n_list`` the matching list of group
            indices; only ``n_list[0]`` is used.
        threshold: click count at or above which a value goes to the first
            slot of the section (default 3).

    Returns:
        ``(pvs, clicks_csv, section_map_csv)``: the click counts and their
        section values, each joined with commas in the same order.

    Slot choice per click count ``c``: ``c >= threshold`` -> ``3n``,
    otherwise ``c != 0`` -> ``3n + 1``, otherwise (``c == 0``) -> ``3n + 2``.
    """
    pvs, clicks, n = row[0], row[1], row[2][0]
    base = 3 * n
    section_map = []
    for c in clicks:
        if c >= threshold:
            offset = 0
        elif c != 0:
            offset = 1
        else:
            offset = 2
        section_map.append(base + offset)
    return (
        pvs,
        ','.join(str(c) for c in clicks),
        ','.join(str(s) for s in section_map),
    )
def flat(l):
    """Yield the leaves of arbitrarily nested lists/tuples, depth first and
    left to right.

    Anything that is not a ``list`` or ``tuple`` (strings included) is
    yielded unchanged.
    """
    for k in l:
        if isinstance(k, (list, tuple)):
            yield from flat(k)
        else:
            yield k
def mkdf_tojoin(df, idname):
    """Return a new DataFrame equal to ``df`` plus a ``LongType`` column
    ``idname`` holding each row's index from ``RDD.zipWithIndex``.

    Uses the module-level Spark session ``ss`` to build the result.
    """
    # Local import: only this helper needs StructType.
    from pyspark.sql.types import StructType

    # Build a fresh schema. ``StructType.add`` appends in place, so calling it
    # on ``df.schema`` would modify the schema object obtained from ``df``.
    schema = StructType(list(df.schema.fields) + [StructField(idname, LongType())])
    # Append the index to the row's own values without flattening them, so a
    # list- or struct-valued cell stays a single field.
    indexed = df.rdd.zipWithIndex().map(lambda pair: list(pair[0]) + [pair[1]])
    return ss.createDataFrame(indexed, schema)
# Print up to 100 rows of df.
df.show(100)
+-------+------+---+----+-----+
| suuid| oaid| y|bins|bins_|
+-------+------+---+----+-----+
|4DONEW4|000-12| 0| 5| 5|
|4DONEW4|000-12| 0| 5| 5|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 0| 3| 3|
|3DONEW3|000-12| 1| 3| 3|
|3DONEW3|000-12| 1| 3| 3|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 0| 2| 2|
|2DONEW2|000-12| 1| 2| 2|
|2DONEW2|000-12| 1| 2| 2|
|2DONEW2|000-12| 1| 2| 2|
|2DONEW2|000-12| 1| 2| 2|
|6DONEW6|000-12| 1| 4| 4|
|6DONEW6|000-12| 0| 4| 4|
|6DONEW6|000-12| 0| 4| 4|
|6DONEW6|000-12| 1| 4| 4|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 0| 1| 1|
| DONEW1|000-12| 1| 1| 1|
| DONEW1|000-12| 1| 1| 1|
| DONEW1|000-12| 1| 1| 1|
| DONEW1|000-12| 1| 1| 1|
| DONEW1|000-12| 1| 1| 1|
| DONEW1|000-12| 1| 1| 1|
|5DONEW5|000-12| 0| 4| 4|
|5DONEW5|000-12| 0| 4| 4|
|5DONEW5|000-12| 1| 4| 4|
+-------+------+---+----+-----+