Example
Collecting data from Kafka
Implement the Spark Streaming class
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object SparkWindowDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("SparkStreamKafkaSource").setMaster("local[*]")
    // Batch interval of 2 seconds; window and slide durations must be multiples of it
    val context = new StreamingContext(conf, Seconds(2))
    // Checkpoint directory for temporary state (relative to the current project path by default)
    context.checkpoint("in")
    // Kafka consumer configuration
    val kafkaParams: Map[String, String] = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.**.**:9092",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup1"
    )
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      context,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set("sparkKafkaDemo"), kafkaParams)
    )
    // Window length 10 seconds (the slide interval defaults to the 2-second batch interval)
    val numStream: DStream[(String, Int)] = kafkaStream.flatMap(x => x.value().toString.split("\\s+")).map((_, 1)).window(Seconds(10))
    // Window length 10 seconds, slide interval 6 seconds
    //val numStream: DStream[(String, Int)] = kafkaStream.flatMap(x => x.value().toString.split("\\s+")).map((_, 1)).window(Seconds(10), Seconds(6))
    // countByWindow: count all elements in the window, window length 10 seconds, slide interval 6 seconds
    //val numStream: DStream[Long] = kafkaStream.flatMap(x => x.value().toString.split("\\s+")).map((_, 1)).countByWindow(Seconds(10), Seconds(6))
    // countByValueAndWindow: count occurrences of each distinct value, window length 10 seconds, slide interval 6 seconds
    //val numStream: DStream[(String, Long)] = kafkaStream.flatMap(x => x.value().toString.split("\\s+")).countByValueAndWindow(Seconds(10), Seconds(6))
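    // Additional variant (a sketch, not part of the original example): sum the per-word counts
    // with reduceByKeyAndWindow, window length 10 seconds, slide interval 6 seconds
    //val numStream: DStream[(String, Int)] = kafkaStream.flatMap(x => x.value().toString.split("\\s+")).map((_, 1)).reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(6))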
    // Print the results
    numStream.print()
    // Start collecting data
    context.start()
    context.awaitTermination()
  }
}
Create the Kafka topic
kafka-topics.sh --create --zookeeper 192.168.**.**:2181 --topic sparkKafkaDemo --partitions 1 --replication-factor 1
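To confirm that the topic was created, it can be listed against the same ZooKeeper address (an optional check, not part of the original steps):
kafka-topics.sh --list --zookeeper 192.168.**.**:2181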
Start a Kafka console producer
kafka-console-producer.sh --topic sparkKafkaDemo --broker-list 192.168.**.**:9092
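Optionally, a console consumer can be attached to the same topic to verify that the producer's messages arrive (not part of the original steps):
kafka-console-consumer.sh --topic sparkKafkaDemo --bootstrap-server 192.168.**.**:9092 --from-beginning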
Start the Spark Streaming application
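Because the master is set to local[*], the job can be started by running the main method of SparkWindowDemo directly from the IDE. Alternatively, a minimal spark-submit sketch would look like the following (the jar name is only a placeholder for the packaged project):
spark-submit --class SparkWindowDemo --master "local[*]" spark-window-demo.jar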
Enter data in the producer
hello world
Spark Streaming output
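With the 10-second window variant above, print() emits one block per batch; for the input above the output looks roughly like the following (the timestamp is illustrative):
-------------------------------------------
Time: 1628563800000 ms
-------------------------------------------
(hello,1)
(world,1)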