Using Flink SQL in IntelliJ to read data from a Kafka topic and write it into a Hudi table
Scenario
Local environment
| Component | Version |
|---|---|
| Hadoop | 3.2.2 |
| Spark | 3.2.4 |
| Flink | 1.16.1 |
| Hive | 3.1.3 |
| Hudi | 0.13.1 |
Initial pom dependencies
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <scala.version>2.12.17</scala.version>
    <scala.binary.version>2.12</scala.binary.version>
    <flink.version>1.16.1</flink.version>
    <hadoop.version>3.2.2</hadoop.version>
    <hudi.version>0.13.1</hudi.version>
</properties>

<dependencies>
    <!-- Flink Client -->
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-clients -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-runtime-web -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web</artifactId>
        <version>${flink.version}</version>
        <scope>test</scope>
    </dependency>
    <!-- Flink Table API & SQL -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-api-java-bridge -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- Flink - Kafka -->
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-flink1.16-bundle</artifactId>
        <version>${hudi.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-shaded-hadoop-2-uber -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-shaded-hadoop-2-uber</artifactId>
        <version>2.8.3-10.0</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.33</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.alibaba.fastjson2/fastjson2 -->
    <dependency>
        <groupId>com.alibaba.fastjson2</groupId>
        <artifactId>fastjson2</artifactId>
        <version>2.0.26</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.24</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>2.0.3</version>
        <!-- <type>pom</type> -->
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-simple</artifactId>
        <version>1.7.31</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
        <scope>runtime</scope>
    </dependency>
</dependencies>
The Flink SQL code that runs without problems
package cn.itcast.hudi;

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;

import static org.apache.flink.table.api.Expressions.$;

/**
 * Reads data from a Kafka topic, transforms it,
 * and registers the result as a temporary view.
 */
public class FlinkSQLKafkaDemo {

    public static void main(String[] args) {
        // 1. Build the Flink table environment in streaming mode
        EnvironmentSettings settings = EnvironmentSettings
                .newInstance()
                .inStreamingMode() // streaming execution
                .build();
        TableEnvironment tableEnvironment = TableEnvironment.create(settings);

        // 2. Create an input table. TODO: consume data from Kafka
        tableEnvironment.executeSql(
                "CREATE TABLE order_kafka_source (\n" +
                " orderId STRING,\n" +
                " userId STRING,\n" +
                " orderTime STRING,\n" +
                " ip STRING,\n" +
                " orderMoney DOUBLE,\n" +
                " orderStatus INT\n" +
                ") WITH (\n" +
                " 'connector' = 'kafka',\n" +
                " 'topic' = 'order_hudi',\n" +
                " 'properties.bootstrap.servers' = 'localhost:9092',\n" +
                " 'properties.group.id' = 'gid-1001',\n" +
                " 'scan.startup.mode' = 'latest-offset',\n" +
                " 'format' = 'json',\n" +
                " 'json.fail-on-missing-field' = 'false',\n" +
                " 'json.ignore-parse-errors' = 'true'\n" +
                ")"
        );

        // 3. Transform the data: either SQL or the Table API can be used
        Table etlTable = tableEnvironment
                .from("order_kafka_source")
                // Add the Hudi partition field: "orderTime":"2021-11-22 10:34:34.136" -> 2021-11-22
                // requires: import static org.apache.flink.table.api.Expressions.*;
                .addColumns(
                        $("orderTime").substring(0, 10).as("partition_day")
                )
                // Add the Hudi precombine (merge) field, a timestamp:
                // "orderId":"20211122103434136000001" -> 20211122103434136
                .addColumns(
                        $("orderId").substring(0, 17).as("ts")
                );
        tableEnvironment.createTemporaryView("view_order", etlTable);

        // 4. Result output. TODO: create an output table and write the results; for now just print
        tableEnvironment.executeSql("SELECT * FROM view_order").print();
    }
}
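To give this job something to read during a local test, it helps to push a sample message into the order_hudi topic in the JSON shape declared by order_kafka_source. Below is a minimal producer sketch of my own; it assumes the kafka-clients dependency is on the classpath, and the class name and field values are made up for illustration:

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class OrderProducerDemo {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Same broker address as in the order_kafka_source definition
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // One record matching the order_kafka_source schema (all values are made up)
        String value = "{\"orderId\":\"20211122103434136000001\",\"userId\":\"u-1001\","
                + "\"orderTime\":\"2021-11-22 10:34:34.136\",\"ip\":\"127.0.0.1\","
                + "\"orderMoney\":99.9,\"orderStatus\":0}";

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            producer.send(new ProducerRecord<>("order_hudi", value));
            producer.flush();
        }
    }
}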
The error appears when step 4 is changed to use Flink SQL to write the Kafka data into a local Hudi table:
// 4. Result output: create the sink table. TODO: map it to a Hudi table,
//    specifying the Hudi table name, storage path, field names, and so on
tableEnvironment.executeSql(
        "CREATE TABLE order_hudi_sink (\n" +
        " orderId STRING PRIMARY KEY NOT ENFORCED,\n" +
        " userId STRING,\n" +
        " orderTime STRING,\n" +
        " ip STRING,\n" +
        " orderMoney DOUBLE,\n" +
        " orderStatus INT,\n" +
        " ts STRING,\n" +
        " partition_day STRING\n" +
        ")\n" +
        "PARTITIONED BY (partition_day) \n" +
        "WITH (\n" +
        " 'connector' = 'hudi',\n" +
        " 'path' = 'file:Users/kturnura/data/hudi-local-warehouse/order_hudi_sink',\n" +
        " 'table.type' = 'MERGE_ON_READ',\n" +
        " 'write.operation' = 'upsert',\n" +
        " 'hoodie.datasource.write.recordkey.field' = 'orderId',\n" +
        " 'write.precombine.field' = 'ts',\n" +
        " 'write.tasks' = '4'\n" +
        ")"
);

// tableEnvironment.executeSql("select * from order_hudi_sink").print();

// 5. Write the data into the sink table via a sub-query
tableEnvironment.executeSql(
        "INSERT INTO order_hudi_sink " +
        "SELECT " +
        "orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day " +
        "FROM view_order"
);
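One more note about running this directly from IntelliJ: executeSql submits the INSERT as a separate job and returns immediately, so main may return while the streaming job is still starting up, and if the JVM then exits the local job goes with it. A small sketch of one way to keep the program alive until the job terminates (the await() call and try/catch wrapper are my addition, not part of the original code):

// Block the main thread on the INSERT job so local execution keeps running.
// TableResult.await() returns only when the job terminates.
try {
    tableEnvironment.executeSql(
            "INSERT INTO order_hudi_sink " +
            "SELECT orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day " +
            "FROM view_order"
    ).await();
} catch (Exception e) {
    e.printStackTrace();
}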
Error messages
Error 1
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.fs.FSDataInputStream
When working in the Flink SQL CLI, this problem is solved by running the following before starting the Flink cluster:

export HADOOP_CLASSPATH=`hadoop classpath`

That works because Flink's startup scripts append HADOOP_CLASSPATH to the JVM classpath. In IntelliJ, exporting the variable in the Terminal has no effect: the IDE launches the application with the Maven project classpath and never goes through those scripts.
Reference: https://blog.csdn.net/appleyuchi/article/details/108822061
Add the following pom dependency:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.2.2</version>
    <!-- 3.2.2 matches the local Hadoop version -->
</dependency>
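After adding that dependency, a throwaway check like the one below can confirm that the missing class is now resolvable from the classpath IntelliJ actually launches the JVM with (the class name and messages are mine, purely for diagnosis):

public class HadoopClasspathCheck {
    public static void main(String[] args) {
        try {
            // The class named in the ClassNotFoundException above
            Class.forName("org.apache.hadoop.fs.FSDataInputStream");
            System.out.println("Hadoop classes found on the run classpath");
        } catch (ClassNotFoundException e) {
            System.out.println("Hadoop classes still missing - check the hadoop-client dependency");
        }
    }
}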
Error 2
After fixing error 1, an old friend showed up:
Caused by: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
This error almost always points to a Guava version conflict on the classpath: the three-argument checkArgument overload only exists in newer Guava releases, so some dependency is loading an incompatible Guava ahead of the one Hadoop 3.2.x expects. A flash of insight: downgrade the pom dependency.
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.0.0</version>
    <!-- the guava.jar pulled in by this version is below 19.0 -->
</dependency>
And now it runs end to end.
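A closing aside: when a Guava conflict like error 2 reappears, it is usually faster to find out which jar the class is actually loaded from than to guess at versions. A one-line check (my own diagnostic snippet, not part of the original code):

// Prints the jar that com.google.common.base.Preconditions was loaded from,
// i.e. the Guava version that is actually winning on the classpath.
System.out.println(
        com.google.common.base.Preconditions.class
                .getProtectionDomain()
                .getCodeSource()
                .getLocation());

Combined with mvn dependency:tree, that usually pins down which dependency needs an exclusion or a version change.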