Spark Streaming Producing Data to Kafka (on Windows)

Environment

This post walks through an exercise done on Windows; the setup commands used are sketched after the version list.

  • JDK: 1.8
  • ZooKeeper: 3.4.14
  • Kafka: 2.3.1 (Scala 2.11 build, i.e. kafka_2.11-2.3.1)
  • Language: Java
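
Before running anything, ZooKeeper and Kafka need to be up, and the source topic feng2020 and the sink topic target need to exist. As a rough sketch (assuming the kafka_2.11-2.3.1 install directory as the working directory; adjust paths to your setup), the usual Windows commands look like this:

:: start ZooKeeper and Kafka, each in its own window
bin\windows\zookeeper-server-start.bat config\zookeeper.properties
bin\windows\kafka-server-start.bat config\server.properties

:: create the two topics (this Kafka version still registers topics via ZooKeeper)
bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic feng2020
bin\windows\kafka-topics.bat --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic target

:: a console producer to type input into feng2020, and a console consumer to watch target
bin\windows\kafka-console-producer.bat --broker-list localhost:9092 --topic feng2020
bin\windows\kafka-console-consumer.bat --bootstrap-server localhost:9092 --topic target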

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>spark</artifactId>
        <groupId>org.feng</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>spark-kafka</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.4.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.6.2</version>
        </dependency>
        <!-- needed for the javax.validation @NotNull annotation used in KafkaProducerProxy -->
        <dependency>
            <groupId>javax.validation</groupId>
            <artifactId>validation-api</artifactId>
            <version>2.0.1.Final</version>
        </dependency>
    </dependencies>
</project>

KafkaProducerFactory

package org.feng.factory;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.feng.proxy.KafkaProducerProxy;

import java.util.Properties;

/**
 * Created by Feng on 2019/12/3 9:34
 * CurrentProject's name is spark
 * Kafka producer factory:
 * provides Kafka producer instances and Kafka producer proxy instances.
 * @author Feng
 */
public class KafkaProducerFactory {

    private KafkaProducerFactory(){
    }

    /**
     * Creates a Kafka producer. Note that a KafkaProducer is not bound
     * to a topic; the topic parameter is unused here and kept only for
     * symmetry with the proxy API.
     * @param topic topic (unused when building the producer)
     * @param brokerList broker list
     * @return a Kafka producer
     */
    public static KafkaProducer<String, Object> newInstance(String topic, String brokerList){

        Properties properties = new Properties();

        properties.put("bootstrap.servers", brokerList);
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        return new KafkaProducer<>(properties);
    }


    /**
     * Creates a proxy for a Kafka producer.
     * @param topic topic
     * @param brokerList broker list
     * @return org.feng.proxy.KafkaProducerProxy
     */
    public static KafkaProducerProxy newProducerProxy(String topic, String brokerList) {
        return new KafkaProducerProxy(topic, brokerList);
    }

    /**
     * Creates a proxy for a Kafka producer,
     * using the default topic and brokerList.
     * @return org.feng.proxy.KafkaProducerProxy
     */
    public static KafkaProducerProxy newProducerProxy() {
        return new KafkaProducerProxy();
    }
}
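
A minimal, hypothetical smoke test of the factory (not part of the project; it assumes a broker on localhost:9092 and the feng2020 topic created above):

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.feng.factory.KafkaProducerFactory;

public class FactorySmokeTest {
    public static void main(String[] args) {
        // the producer itself is not bound to a topic; the topic goes on each record
        KafkaProducer<String, Object> producer =
                KafkaProducerFactory.newInstance("feng2020", "localhost:9092");
        producer.send(new ProducerRecord<>("feng2020", "hello kafka"));
        // close() flushes buffered records before shutting down
        producer.close();
    }
}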

CreateKafkaProducer

package org.feng.poll;

import org.apache.commons.pool2.impl.GenericObjectPool;
import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
import org.feng.proxy.KafkaProducerProxy;

/**
 * Created by Feng on 2019/12/3 14:12
 * CurrentProject's name is spark
 * Builds the producer pool. The pool is held in a static field so that each
 * executor JVM reuses a single pool (and its producers) instead of creating,
 * and leaking, a fresh pool for every partition.
 * @author Feng
 */
public class CreateKafkaProducer {

    private static volatile GenericObjectPool<KafkaProducerProxy> instance;

    public GenericObjectPool<KafkaProducerProxy> apply(){
        if(instance == null){
            synchronized (CreateKafkaProducer.class){
                if(instance == null){
                    KafkaProducerPool pool = new KafkaProducerPool();
                    GenericObjectPoolConfig<KafkaProducerProxy> config = new GenericObjectPoolConfig<>();
                    config.setMaxIdle(10);
                    config.setMaxTotal(10);
                    instance = new GenericObjectPool<>(pool, config);
                }
            }
        }
        return instance;
    }
}

KafkaProducerPool

package org.feng.poll;

import org.apache.commons.pool2.BasePooledObjectFactory;
import org.apache.commons.pool2.PooledObject;
import org.apache.commons.pool2.impl.DefaultPooledObject;
import org.feng.factory.KafkaProducerFactory;
import org.feng.proxy.KafkaProducerProxy;

/**
 * Created by Feng on 2019/12/3 14:06
 * CurrentProject's name is spark
 * Pooled-object factory for the Kafka producer pool: the pool holds
 * proxy objects that wrap Kafka producers.
 * @author Feng
 */
public class KafkaProducerPool extends BasePooledObjectFactory<KafkaProducerProxy> {

    /**
     * Called by the pool when it needs a new object.
     * @return org.feng.proxy.KafkaProducerProxy
     */
    @Override
    public KafkaProducerProxy create() {
        return KafkaProducerFactory.newProducerProxy();
    }

    /**
     * Wraps a created object so the pool can track it.
     * @param kafkaProducerProxy the Kafka producer proxy to wrap
     * @return the pooled wrapper
     */
    @Override
    public PooledObject<KafkaProducerProxy> wrap(KafkaProducerProxy kafkaProducerProxy) {
        return new DefaultPooledObject<>(kafkaProducerProxy);
    }
}
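
A quick hedged sketch of how borrowing and returning works against this pool (PoolSmokeTest is hypothetical; borrowObject throws a checked Exception, hence the throws clause):

import org.apache.commons.pool2.impl.GenericObjectPool;
import org.feng.poll.CreateKafkaProducer;
import org.feng.proxy.KafkaProducerProxy;

public class PoolSmokeTest {
    public static void main(String[] args) throws Exception {
        GenericObjectPool<KafkaProducerProxy> pool = new CreateKafkaProducer().apply();
        // blocks until a producer is free (maxTotal is 10)
        KafkaProducerProxy proxy = pool.borrowObject();
        try {
            proxy.send("hello from the pool");
        } finally {
            // return the producer so other borrowers can reuse it
            pool.returnObject(proxy);
        }
    }
}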

KafkaProducerProxy

package org.feng.proxy;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.feng.factory.KafkaProducerFactory;

import javax.validation.constraints.NotNull;
/**
 * Created by Feng on 2019/12/3 9:40
 * CurrentProject's name is spark
 * Proxy around a Kafka producer.
 * @author Feng
 */
public class KafkaProducerProxy {
    private KafkaProducer<String, Object> kafkaProducer;
    private String defaultTopic;

    /**
     * Default constructor:
     * defaultTopic = "feng2020"
     * brokerList = "localhost:9092"
     */
    public KafkaProducerProxy(){
        this("feng2020", "localhost:9092");
    }

    /**
     * Creates a Kafka producer with the given topic and brokerList.
     * @param topic topic
     * @param brokerList broker list
     */
    public KafkaProducerProxy(String topic, String brokerList){
        defaultTopic = topic;
        kafkaProducer = KafkaProducerFactory.newInstance(topic,
                brokerList);
    }

    private void send(@NotNull String topic, String key, Object value){
        kafkaProducer.send(toRecord(topic, key, value));
    }

    /**
     * Sends a message to the default topic.
     * @param value message payload
     */
    public void send(Object value){
        send(defaultTopic, null, value);
    }

    /**
     * Sends a message to the given topic.
     * @param topic topic
     * @param value message payload
     */
    public void send(@NotNull String topic, Object value){
        send(topic, null, value);
    }

    private ProducerRecord<String, Object> toRecord(String topic, String key, Object value){
        return new ProducerRecord<>(topic, key, value);
    }

    /**
     * Closes the Kafka producer.
     */
    public void shutdown(){
        if(kafkaProducer != null){
            kafkaProducer.close();
        }
    }
}
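
And a similar hypothetical check of the proxy on its own, exercising both send overloads (assumes the default broker and both topics exist):

import org.feng.factory.KafkaProducerFactory;
import org.feng.proxy.KafkaProducerProxy;

public class ProxySmokeTest {
    public static void main(String[] args) {
        KafkaProducerProxy proxy = KafkaProducerFactory.newProducerProxy();
        proxy.send("goes to the default topic feng2020");
        proxy.send("target", "goes to the target topic");
        // flush and release the underlying producer
        proxy.shutdown();
    }
}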

KafkaStreamingClient

package org.feng.client;

import org.apache.commons.pool2.impl.GenericObjectPool;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.feng.poll.CreateKafkaProducer;
import org.feng.proxy.KafkaProducerProxy;

import java.util.*;

/**
 * Created by Feng on 2019/12/3 14:18
 * CurrentProject's name is spark
 * For starting Kafka and creating topics on Windows, see:
 * https://blog.csdn.net/summerZBH123/article/details/79817001
 * @author Feng
 */
public class KafkaStreamingClient {
    public static void main(String[] args) throws InterruptedException {
        Map<String, Object> kafkaParams = new HashMap<>(16);
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaParams.put("group.id", "spark_kafka");
        // where to start when the group has no committed offset
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Collections.singletonList("feng2020");

        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("KafkaStreamingClient");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));

        KafkaUtils.createDirectStream(javaStreamingContext, LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams))
                .map(line -> ">>>" + line.value())
                .foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> rdd.foreachPartition((VoidFunction<Iterator<String>>) records -> {
                    // apply() hands back the per-JVM singleton pool, so producers are reused across partitions
                    GenericObjectPool<KafkaProducerProxy> pool = new CreateKafkaProducer().apply();
                    KafkaProducerProxy producerProxy = pool.borrowObject();

                    try {
                        while (records.hasNext()) {
                            String next = records.next();
                            System.out.println("next:" + next);
                            producerProxy.send("target", next);
                        }
                    } finally {
                        // return the producer to the pool
                        pool.returnObject(producerProxy);
                    }
                }));
        javaStreamingContext.start();
        javaStreamingContext.awaitTermination();
    }
}

Results

Console output: (screenshot omitted)
Consumer window: (screenshot omitted)
Producer window: (screenshot omitted)

Notes on the results:
The producer window stayed open the whole time, while the consumer window was closed and reopened once in the middle, which is why only the last two records appear there.

When the program sends data to the Kafka target topic, it prepends the string >>> to each record. So the messages visible in the consumer window are the processed versions of whatever was typed into the producer window. The same records also appear in the IDEA console, because the program prints each record just before sending it.

That confirms the program is up and running!

