Hand-Writing a MapReduce Combiner

Data Source

Prepare four txt files; the contents are up to you. I used English words separated by spaces.
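
As a concrete (made-up) example, one input file might contain:

hadoop spark hadoop
spark hive

For that file alone, the final job output would be the per-word totals:

hadoop	2
hive	1
spark	2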

Mapper

package com.zhengkw.combiner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName:WordcountMapper
 * @author: zhengkw
 * @description: mapper
 * @date: 20/02/24 8:42 AM
 * @version:1.0
 * @since: jdk 1.8
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    /**
     * @param key
     * @param value
     * @param context
     * @description: override map to implement wordcount
     * @return: void
     * @date: 20/02/24 8:47 AM
     * @author: zhengkw
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 获取一行
        String line = value.toString().trim();
        // 2 切割
        String[] words = line.split(" ");

        // 3 输出
        for (String word : words
        ) {
            k.set(word);
           //期望输出的是<hadoop,1> --> <string,int>
            context.write(k, v);
        }

    }
}
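
For the sample line hadoop spark hadoop above, this mapper emits <hadoop,1>, <spark,1>, <hadoop,1>: one pair per occurrence, with all aggregation deferred to the combiner and reducer.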

Reduce

package com.zhengkw.combiner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName:WordcountReduce
 * @author: zhengkw
 * @description: reducer
 * @date: 20/02/24 8:42 AM
 * @version:1.0
 * @since: jdk 1.8
 */
public class WordcountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    int sum;
    IntWritable v = new IntWritable();

    /**
     * @param key
     * @param values
     * @param context
     * @description: sum the counts for each word
     * @return: void
     * @date: 20/02/24 8:51 AM
     * @author: zhengkw
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Reset the running total; reduce() is called once per key on the same instance
        sum = 0;
        // 1 Accumulate the counts
        for (IntWritable value : values) {
            sum += value.get();
        }

        // 2 Emit <word, total>
        v.set(sum);
        context.write(key, v);
    }

}
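
Note the sum = 0; reset at the top of reduce(): Hadoop calls reduce() once per key on the same Reducer instance, so without the reset one key's total would leak into the next. Declaring int sum = 0; locally inside the method would work equally well.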

Combiner

package com.zhengkw.combiner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName:WordCountCombiner
 * @author: zhengkw
 * @description: pre-aggregates map output before it is shuffled
 * @date: 20/02/27 11:41 AM
 * @version:1.0
 * @since: jdk 1.8
 */
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    int sum;
    IntWritable v = new IntWritable();

    /**
     * @param key
     * @param values
     * @param context
     * @description: same summing logic as the reducer, run on the map side
     * @return: void
     * @date: 20/02/24 8:51 AM
     * @author: zhengkw
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Reset the running total; reduce() is called once per key on the same instance
        sum = 0;
        // 1 Accumulate the partial counts
        for (IntWritable value : values) {
            sum += value.get();
        }

        // 2 Emit the locally combined <word, partial total>
        v.set(sum);
        context.write(key, v);
    }
}
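
The combiner's payoff shows up in the intermediate data. Continuing the made-up sample, one map task's spill might shrink like this:

before combine: <hadoop,1> <hadoop,1> <spark,1> <spark,1> <hive,1>
after combine:  <hadoop,2> <spark,2> <hive,1>

Only the combined pairs are written to disk and shuffled to the reducer, which is why a combiner helps most on repetitive data.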

Driver

package com.zhengkw.combiner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @description: driver for the wordcount job with a combiner
 * @date: 20/02/24 8:53 AM
 * @author: zhengkw
 */
public class WordcountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Input path
        Path inputPath = new Path("F:\\mrinput\\combine");
        // Output path
        Path outputPath = new Path("f:/output3");

        Configuration conf = new Configuration();

        // Delete the output path if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }


        // 1 Instantiate the Job from the configuration
        Job job = Job.getInstance(conf);

        // 2 Set the jar by locating this driver class
        job.setJarByClass(WordcountDriver.class);

        // 3 Set the mapper, reducer, and combiner classes
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReduce.class);
        job.setCombinerClass(WordCountCombiner.class);
        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // 7 Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);

    }
}
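
One way to confirm the combiner actually ran is to read the job's combine counters after completion. A minimal sketch (not part of the original driver) that could go just before System.exit, using the standard org.apache.hadoop.mapreduce.TaskCounter enum:

import org.apache.hadoop.mapreduce.TaskCounter;

// after job.waitForCompletion(true):
long combineIn = job.getCounters()
        .findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
long combineOut = job.getCounters()
        .findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
System.out.println("Combine input records:  " + combineIn);
System.out.println("Combine output records: " + combineOut);

If the combiner is wired in, combineOut should be noticeably smaller than combineIn.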

Summary

  • The combiner runs during the spill phase of the shuffle, on the map side. Use it when the reduce logic is a simple sum (addition/subtraction); with division or more complex logic (e.g., averaging), combining can change the result.
  • No reduce phase means no shuffle, and therefore no combiner.
  • The combiner runs on every spill.
  • During the merge step, if there are 3 or more spill files, the combiner runs again.
  • It is enabled with job.setCombinerClass(WordCountCombiner.class); see the note below for a common shortcut.
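
Since WordCountCombiner is line-for-line identical to WordcountReduce, a common shortcut is to pass the reducer class itself as the combiner:

job.setCombinerClass(WordcountReduce.class);

This is safe here only because the reducer's input and output types match the map output types (Text, IntWritable) and summation is associative and commutative.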

Reposted from blog.csdn.net/qq_37714755/article/details/104739462