Spark DataFrame Learning Notes

import org.apache.spark.sql.{SQLContext, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.functions.{monotonically_increasing_id, desc, col, regexp_replace}
import org.apache.spark.storage.StorageLevel

val sqlContext = new HiveContext(sc)
// import the implicits only after sqlContext has been created
import sqlContext.implicits._

// Run a SQL statement directly
val test_data = sqlContext.sql("select load_id,visits_id,cookie,first_visit_dt,if(event_id =='' or event_id is null, 'event_id_null',event_id) AS event_id,datediff(from_unixtime(unix_timestamp(dt,'yyyyMMdd'),'yyyy-MM-dd'),sdate) diff_date from ad.aaa_use_leads_log where dt=20190301 and series_flag='3691' limit 5000")
// Print the test data with collect().foreach(println) or show()
test_data.collect().foreach(println)
test_data.show()
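// Sketch: show() prints 20 rows and truncates long cells by default;
// show(numRows, truncate) controls both
test_data.show(50, false)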
// Aggregations: count, sum, count(distinct), and sorting
// (train_data is assumed to be loaded the same way as test_data, from the training partition)
val pv_number = train_data.groupBy("cookie", "is_leads").count()
pv_number.show()
val duration = train_data.groupBy("cookie", "is_leads").sum("duration_int")
duration.show()
val visits_id = train_data.select("cookie", "is_leads", "visits_id").distinct().groupBy("cookie", "is_leads").count()
visits_id.show()
var event_data = train_data.groupBy("event_id").count().orderBy(desc("count"))
event_data.show()
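// Sketch (assuming train_data has the columns used above): the same statistics
// can also be computed in a single pass with agg()
import org.apache.spark.sql.functions.{count, countDistinct, sum, lit}

val agg_stats = train_data.groupBy("cookie", "is_leads").agg(
  count(lit(1)).as("pv_number"),
  sum("duration_int").as("total_duration"),
  countDistinct("visits_id").as("visits_cnt")
)
agg_stats.orderBy(desc("pv_number")).show()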
// join and where
val tmp_te = test_data.select("cookie", "event_id")
val tmp_te_1 = tmp_te.join(event_data, Seq("event_id"), "left").where("count is not null")
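// Sketch: the same filter written with Column expressions instead of a SQL string
val tmp_te_1_alt = tmp_te.join(event_data, Seq("event_id"), "left")
  .where(col("count").isNotNull)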
// Add an id column to the DataFrame; monotonically_increasing_id generates a unique id per row
val event_data_new = event_data.select("event_id").withColumn("cookie", monotonically_increasing_id())
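// Sketch: monotonically_increasing_id yields unique but not necessarily consecutive ids;
// if consecutive indices are needed, zipWithIndex on the underlying RDD is one alternative
// (event_id is assumed to be a non-null string, as produced by the SQL above)
import org.apache.spark.sql.types.LongType

val event_data_indexed = sqlContext.createDataFrame(
  event_data.select("event_id").rdd.zipWithIndex.map {
    case (Row(event_id: String), idx) => Row(event_id, idx)
  },
  StructType(Seq(StructField("event_id", StringType), StructField("cookie", LongType)))
)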
// Trick: regexp_replace the entire string to overwrite a DataFrame column with a constant value
val tmp_te_2 = tmp_te.join(event_data, Seq("event_id"), "left")
  .where("count is null")
  .withColumn("event_id", regexp_replace(regexp_replace(col("event_id"), ".*", ""), "", "event_others"))
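// Sketch: the same result can be had more directly with lit()
import org.apache.spark.sql.functions.lit

val tmp_te_2_alt = tmp_te.join(event_data, Seq("event_id"), "left")
  .where("count is null")
  .withColumn("event_id", lit("event_others"))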
// union (unionAll; in Spark 2.x this is replaced by union())
val tmp_te_all = tmp_te_1.unionAll(tmp_te_2)
// Group, pivot, and aggregate the data
val tmp_all = event_data_new.unionAll(tmp_te_all.select("event_id", "cookie"))
// count() is used as the aggregation here since there is no separate numeric value column to sum
tmp_all.groupBy("cookie").pivot("event_id").count().show()
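// Toy sketch to illustrate what the pivot produces (cookies/events here are made up)
val toy = Seq(("c1", "click"), ("c1", "click"), ("c1", "view"), ("c2", "view")).toDF("cookie", "event_id")
toy.groupBy("cookie").pivot("event_id").count().show()
// +------+-----+----+
// |cookie|click|view|
// +------+-----+----+
// |    c1|    2|   1|
// |    c2| null|   1|
// +------+-----+----+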
// Select the needed columns and save the data
cookie_dict_tr.select("is_leads").write.mode(SaveMode.Append).text("/user/ad_tj/aaa/leads_project/aaa")
// Convert the DataFrame to an RDD, strip the surrounding brackets, then save
cookie_dict_tr.rdd.map(_.toString().replaceAll("\\[", "").replaceAll("\\]", "")).saveAsTextFile("/user/ad_tj/aaa/leads_project/aaa5")
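// Sketch: instead of stripping "[...]" from Row.toString, the columns can be joined
// explicitly with concat_ws before writing (column list and output path are reused for illustration)
import org.apache.spark.sql.functions.concat_ws

cookie_dict_tr
  .select(concat_ws(",", col("cookie"), col("is_leads")).as("value"))
  .write.mode(SaveMode.Append)
  .text("/user/ad_tj/aaa/leads_project/aaa")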