Window Operations Based on Event Time
Detailed workflow:
Method 1: the socket source attaches a receive timestamp to each line (includeTimestamp), each line is split into words, and the word counts are grouped by a 10-minute window that slides every 5 minutes over that timestamp.
package com.kfk.spark.structuredstreaming;
import com.kfk.spark.common.CommSparkSession;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import scala.Tuple2;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* @author : 蔡政洁
* @email :[email protected]
* @date : 2020/12/22
* @time : 9:44 PM
*/
public class EventTimeWindow {
public static void main(String[] args) throws StreamingQueryException {
SparkSession spark = CommSparkSession.getSparkSession();
// input table
Dataset<Row> dflines = spark.readStream()
.format("socket")
.option("host", "bigdata-pro-m04")
.option("port", 9999)
.option("includeTimestamp",true)
.load();
/**
 * Input line: spark storm hive
 * Data model once the line is received:
 * Step 1: words -> spark storm hive, timestamp -> 2019-09-09 12:12:12
 *         e.g. tuple -> (spark storm hive, 2019-09-09 12:12:12)
 *
 * Step 2: split()
 *         split the text part of the tuple on spaces
 *
 * Step 3: flatMap()
 *         return type -> list(tuple(spark, 2019-09-09 12:12:12),
 *                             tuple(storm, 2019-09-09 12:12:12),
 *                             tuple(hive, 2019-09-09 12:12:12))
 */
Dataset<Row> words = dflines.as(Encoders.tuple(Encoders.STRING(),Encoders.TIMESTAMP()))
.flatMap(new FlatMapFunction<Tuple2<String, Timestamp>, Tuple2<String, Timestamp>>() {
@Override
public Iterator<Tuple2<String, Timestamp>> call(Tuple2<String, Timestamp> stringTimestampTuple2) throws Exception {
List<Tuple2<String, Timestamp>> result = new ArrayList<>();
for (String word : stringTimestampTuple2._1.split(" ")){
result.add(new Tuple2<>(word,stringTimestampTuple2._2));
}
return result.iterator();
}
},Encoders.tuple(Encoders.STRING(),Encoders.TIMESTAMP())).toDF("word","wordtime");
// result table
Dataset<Row> windowedCounts = words.groupBy(
functions.window(words.col("wordtime"),
"10 minutes",
"5 minutes"),
words.col("word")
).count();
StreamingQuery query = windowedCounts.writeStream()
.outputMode("update")
.format("console")
.option("truncate","false")
.start();
query.awaitTermination();
}
}
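Both programs obtain their SparkSession from a small helper class, CommSparkSession, whose source is not shown in this post. A minimal sketch of what it presumably looks like (the app name and the local master are assumptions, not the author's actual settings):
package com.kfk.spark.common;
import org.apache.spark.sql.SparkSession;
public class CommSparkSession {
    // Build (or reuse) a SparkSession; the values below are placeholders.
    public static SparkSession getSparkSession() {
        SparkSession spark = SparkSession.builder()
                .appName("EventTimeWindowDemo")   // assumed app name
                .master("local[2]")               // assumed local master for testing
                .getOrCreate();
        // Keep the console sink output readable
        spark.sparkContext().setLogLevel("WARN");
        return spark;
    }
}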
Method 2: the event time travels inside the data itself (e.g. 2019-03-29 16:40:00,hadoop); each line is parsed into an EventData bean and the counts are windowed on its wordtime field. First, the JavaBean:
package com.kfk.spark.structuredstreaming;
import java.sql.Timestamp;
/**
* @author : 蔡政洁
* @email :[email protected]
* @date : 2020/12/22
* @time : 9:25 PM
*/
public class EventData {
//wordtime: Timestamp
//word: String
private Timestamp wordtime;
private String word;
public Timestamp getWordtime() {
return wordtime;
}
public void setWordtime(Timestamp wordtime) {
this.wordtime = wordtime;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public EventData() {
}
public EventData(Timestamp wordtime, String word) {
this.wordtime = wordtime;
this.word = word;
}
}
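EventData follows JavaBean conventions (a no-argument constructor plus getters and setters), which is what the bean encoder in the next program relies on to derive the two columns wordtime and word. The streaming program then parses each socket line into this bean: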
package com.kfk.spark.structuredstreaming;
import com.kfk.spark.common.CommSparkSession;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import java.sql.Timestamp;
/**
* @author : 蔡政洁
* @email :[email protected]
* @date : 2020/12/22
* @time : 9:44 PM
*/
public class EventTimeWindow2 {
public static void main(String[] args) throws StreamingQueryException {
SparkSession spark = CommSparkSession.getSparkSession();
// input table
Dataset<String> dflines = spark.readStream()
.format("socket")
.option("host", "bigdata-pro-m04")
.option("port", 9999)
.load().as(Encoders.STRING());
/**
 * Source lines look like:
 *   2019-03-29 16:40:00,hadoop
 *   2019-03-29 16:40:10,storm
 *
 * Each line is converted into a Dataset<EventData> using the JavaBean encoder.
 */
Dataset<EventData> dfeventdata = dflines.map(new MapFunction<String, EventData>() {
@Override
public EventData call(String value) throws Exception {
String[] lines = value.split(",");
return new EventData(Timestamp.valueOf(lines[0]),lines[1]);
}
}, ExpressionEncoder.javaBean(EventData.class));
// result table
Dataset<Row> windowedCounts = dfeventdata.groupBy(
functions.window(dfeventdata.col("wordtime"),
"10 minutes",
"5 minutes"),
dfeventdata.col("word")
).count();
StreamingQuery query = windowedCounts.writeStream()
.outputMode("update")
.format("console")
.option("truncate","false")
.start();
query.awaitTermination();
}
}
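To try either program, start a socket server on the configured host before launching the query, e.g. with nc -lk 9999, and type input lines: plain space-separated words for EventTimeWindow, and timestamp,word lines such as 2019-03-29 16:40:00,hadoop for EventTimeWindow2. With outputMode("update"), each micro-batch prints to the console only the (window, word) counts that changed.
Not covered in this post, but worth noting: an event-time window aggregation is usually paired with a watermark so Spark can eventually drop old window state instead of keeping it forever. A sketch of how the grouping in EventTimeWindow2 would look with a 10-minute watermark (the threshold is an arbitrary example):
// Sketch only: bound the window state with a watermark on the event-time column
Dataset<Row> windowedCounts = dfeventdata
        .withWatermark("wordtime", "10 minutes")
        .groupBy(
                functions.window(dfeventdata.col("wordtime"), "10 minutes", "5 minutes"),
                dfeventdata.col("word"))
        .count();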