Overview
Kafka offsets are stored in HBase: each time the Spark Streaming job consumes from Kafka it starts from the offsets saved by the previous run, and the stored offsets are updated only after the batch's processing logic has finished. An abnormal exit therefore never loses the consumption position, so no messages are skipped; at worst the batch that was in flight is reprocessed on restart (at-least-once semantics).
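The offset table only needs a single column family, topicinfo, and must exist before the job starts (the scan at the end of this post uses a table named kafka_offSet). Below is a minimal sketch for creating it up front with the HBase Admin API; it assumes an HBase 2.x client and uses a placeholder ZooKeeper quorum that you would replace with your own.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateOffsetTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    // Assumption: replace with your own ZooKeeper quorum
    conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    val tableName = TableName.valueOf("kafka_offSet")
    if (!admin.tableExists(tableName)) {
      // One column family "topicinfo" holding the topic name and "partition|offset" pairs
      val descriptor = TableDescriptorBuilder.newBuilder(tableName)
        .setColumnFamily(ColumnFamilyDescriptorBuilder.of("topicinfo"))
        .build()
      admin.createTable(descriptor)
    }
    admin.close()
    connection.close()
  }
}

The equivalent HBase shell command is create 'kafka_offSet', 'topicinfo'.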
Utility class
import java.util.HashMap

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put, Table}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies, OffsetRange}
object OffsetHbaseUtil {

  /** Get an HBase connection. */
  def getConnection(zkhosts_hbase: String) = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", zkhosts_hbase)
    ConnectionFactory.createConnection(hbaseConf)
  }
  /**
   * Create a direct stream, resuming from the offsets stored in HBase when they exist.
   */
  def createStreamingContextHbase(ssc: StreamingContext,
                                  topics: Array[String],
                                  kafkaParams: Map[String, Object],
                                  table: Table): InputDStream[ConsumerRecord[String, String]] = {
    val offSets = getOffset(topics, table)
    if (offSets.nonEmpty) {
      // Offsets found: start consuming from the stored positions
      KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offSets))
    } else {
      // First run (no stored offsets): fall back to auto.offset.reset
      KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    }
  }
  /**
   * Read the previously stored offsets from HBase.
   * Row key: "<topic>_offset"; topicinfo:partition holds "partition|offset" pairs joined by ",".
   */
  def getOffset(topics: Array[String], table: Table) = {
    val fromOffSets = scala.collection.mutable.LinkedHashMap[TopicPartition, Long]()
    for (topic <- topics) {
      val get = new Get(s"${topic}_offset".getBytes)
      val result = table.get(get)
      if (result.getRow != null) {
        val topicBytes = result.getValue("topicinfo".getBytes, "topic".getBytes)
        val partitionBytes = result.getValue("topicinfo".getBytes, "partition".getBytes)
        if (topicBytes != null && partitionBytes != null) {
          val top = new String(topicBytes)
          val partitions = new String(partitionBytes)
          // Each entry has the form "partition|untilOffset"
          for (entry <- partitions.split(",")) {
            val pt = entry.split("\\|")
            fromOffSets.put(new TopicPartition(top, pt(0).toInt), pt(1).toLong)
          }
        }
      }
    }
    fromOffSets
  }
  /**
   * Store the latest offsets in HBase after a batch has been processed.
   */
  def storeOffSet(ranges: Array[OffsetRange], topic: Array[String], table: Table) = {
    // Group the offset ranges by topic into "partition|untilOffset" entries joined by ","
    val map = new HashMap[String, String]
    ranges.foreach { x =>
      val entry = s"${x.partition}|${x.untilOffset}"
      if (map.get(x.topic) != null) map.put(x.topic, map.get(x.topic) + "," + entry)
      else map.put(x.topic, entry)
    }
    val topicNames = map.keySet().iterator()
    while (topicNames.hasNext) {
      val topicName = topicNames.next()
      val put = new Put(s"${topicName}_offset".getBytes)
      put.addColumn("topicinfo".getBytes, "topic".getBytes, topicName.getBytes)
      put.addColumn("topicinfo".getBytes, "partition".getBytes, map.get(topicName).getBytes)
      table.put(put)
    }
  }
}
Example
import org.apache.hadoop.hbase.TableName
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.HasOffsetRanges

object Event2CKStreamJob {
def compute(spark: SparkSession, args: Array[String]): Unit = {
    // Validate the number of submitted arguments
    if (args.length < 15) {
      System.err.println(
        """Usage:
          |  <master>
          |  <jobId>
          |  <ckHost>
          |  <ckPort>
          |  <dbName>
          |  <ckUser>
          |  <ckPasswd>
          |  <zkQuorum>
          |  <bootstrapServers>
          |  <consumerGroupID>
          |  <topics_input>
          |  <batchDuration>
          |  <autoOffsetReset>
          |  <maxRatePerPartition>
          |  <htable>
          |""".stripMargin)
      sys.exit(1)
    }
    // Parse the arguments into named values
    println("params=>" + args.mkString(" "))
    val Array(master, jobId,
      ckHost,
      ckPort,
      dbName,
      ckUser,
      ckPasswd,
      zkQuorum,
      bootstrapServers,
      consumerGroupID,
      topics_input,
      batchDuration,
      autoOffsetReset,
      maxRatePerPartition,
      htable
    ) = args
    println(zkQuorum)
spark.sparkContext.getConf.set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition)
spark.sparkContext.getConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val ssc = new StreamingContext(spark.sparkContext, Seconds(batchDuration.toLong))
// ssc.checkpoint("/tmp/" + System.currentTimeMillis())
// val ssc = new StreamingContext(sparkConf, Milliseconds(millisecondsStr.toLong))
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> consumerGroupID,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      "auto.offset.reset" -> autoOffsetReset,
      "session.timeout.ms" -> "30000",
      "heartbeat.interval.ms" -> "10000",
      "fetch.max.wait.ms" -> "3000",
      "fetch.min.bytes" -> "4194304",
      "max.poll.interval.ms" -> "300000",
      // Offsets are committed manually to HBase, so Kafka auto-commit is disabled
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    kafkaParams.foreach(x => println(x._1 + ":" + x._2))
    val topics = topics_input.split(",").toSet.toArray
    // Look up the stored offsets in HBase and build the direct stream from them
    val table = OffsetHbaseUtil.getConnection(zkQuorum).getTable(TableName.valueOf(htable))
    val kafkaStreams = OffsetHbaseUtil.createStreamingContextHbase(ssc, topics, kafkaParams, table)
    kafkaStreams.foreachRDD((rdd, batchTime) => {
      import spark.implicits._
      val filterDS = getFilterDS(spark, rdd.map(line => line.value()).toDS())
      val startEventDS = getStartEventDS(spark, filterDS)
      val msgReadEventDS = getMsgReadEventDS(spark, filterDS)
      val msgAckEventDS = getMsgAckEventDS(spark, filterDS)
      val jdbcUrl = s"jdbc:clickhouse://$ckHost:$ckPort/$dbName"
      val prop = getCKJdbcProperties(ckUser, ckPasswd)
      startEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_start", prop)
      msgReadEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_msg_read", prop)
      msgAckEventDS.write
        .mode(SaveMode.Append)
        .jdbc(jdbcUrl, "event_msg_ack", prop)
      // Store the new offsets only after the ClickHouse writes have succeeded
      OffsetHbaseUtil.storeOffSet(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, topics, table)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
HBase table contents
hbase(main):002:0> scan 'kafka_offSet'
ROW COLUMN+CELL
EventReportTopic_offset column=topicinfo:partition, timestamp=1675072501550, value=0|473609798,1|473518108,2|472762027
EventReportTopic_offset column=topicinfo:topic, timestamp=1675072501550, value=EventReportTopic
PubMsgTopic_offset column=topicinfo:partition, timestamp=1675072500591, value=0|276,1|276,2|266
PubMsgTopic_offset column=topicinfo:topic, timestamp=1675072500591, value=PubMsgTopic
References
https://blog.cloudera.com/offset-management-for-apache-kafka-with-apache-spark-streaming/