Implementing a Combiner in Hadoop MapReduce

Implementing the Combiner yourself. The Combiner extends Reducer and runs on the map side between the map and reduce phases: for each word_file key it sums the per-file occurrence counts, then emits the word as the key and file:count as the value.

package com.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyCombiner extends Reducer<Text, Text, Text, Text> {

	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		/**
		 * Incoming data format:
		 * key              value
		 * hadoop_1.html    1
		 * hadoop_1.html    1
		 *
		 * Emitted data format:
		 * key              value
		 * hadoop           1.html:3
		 */
		// Split "word_fileName" into the word and the file name.
		String[] split = key.toString().split("_");
		// Sum the occurrence counts of this word in this file.
		int count = 0;
		for (Text t : values) {
			count += Integer.parseInt(t.toString());
		}
		context.write(new Text(split[0]), new Text(split[1] + ":" + count));
	}

}
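
Note that Hadoop treats a Combiner as an optimization that may run zero or more times, so a Combiner that reshapes the key and value as this one does is fragile; it works for this demo but is not a pattern to rely on in production. To see the transformation in isolation, here is a minimal local sanity check of the same split-and-sum logic, written as plain Java without Hadoop (the sample key and counts are assumptions taken from the comment above):

package com.mapreduce;

public class CombinerLogicDemo {

	public static void main(String[] args) {
		// Simulated shuffle input for one key, matching the comment in MyCombiner.
		String key = "hadoop_1.html";
		String[] values = {"1", "1", "1"};
		// Same logic as MyCombiner.reduce(): split the key, sum the counts.
		String[] split = key.split("_");
		int count = 0;
		for (String v : values) {
			count += Integer.parseInt(v);
		}
		// Prints: hadoop	1.html:3
		System.out.println(split[0] + "\t" + split[1] + ":" + count);
	}
}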

The full implementation. The driver wires the mapper, Combiner, and reducer together and configures client access to an HDFS HA cluster.

package com.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Input content:
 * 1.html
 * hadoop hadoop hadoop is good
 * 2.html
 * hadoop hbase hbase is better is good
 * 3.html
 * hbase hadoop hive hive is nice is good
 *
 * Output content (key and value are tab-separated):
 *
 * hadoop	1.html:3,2.html:1,3.html:1
 * is	1.html:1,2.html:2,3.html:2
 * ...
 * @author Administrator
 *
 */
public class DescIndexDemo implements Tool{
	
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
		
		/**
		 * value holds one line of the input text.
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Get the name of the file this split comes from.
			InputSplit is = context.getInputSplit();
			String fileName = ((FileSplit) is).getPath().getName();
			String line = value.toString();
			String[] words = line.split(" ");
			for (String word : words) {
				// Emit "word_fileName" -> "1" for every occurrence.
				context.write(new Text(word + "_" + fileName), new Text("1"));
			}
		}

	}

	/**
	 * Incoming data format:
	 * key              value
	 * hadoop           1.html:3
	 * hadoop           2.html:1
	 *
	 * Emitted content:
	 *
	 * hadoop	1.html:3,2.html:1,3.html:1
	 * is	1.html:1,2.html:2,3.html:2
	 * @author Administrator
	 *
	 */
	public static class MyReduce extends Reducer<Text, Text, Text, Text> {

		@Override
		protected void reduce(Text key, Iterable<Text> list, Context context)
				throws IOException, InterruptedException {
			// Concatenate all "file:count" entries for this word.
			StringBuilder sb = new StringBuilder();
			for (Text t : list) {
				sb.append(t).append(",");
			}
			// Drop the trailing comma before emitting.
			context.write(key, new Text(sb.substring(0, sb.length() - 1)));
		}

	}

	// Hold the Configuration handed in by ToolRunner so that run() sees
	// the HA settings applied in setConf().
	private Configuration conf;

	/**
	 * Configure client access to the HDFS HA nameservice "zwj".
	 */
	public void setConf(Configuration conf) {
		conf.set("fs.defaultFS", "hdfs://zwj");
		conf.set("dfs.nameservices", "zwj");
		conf.set("dfs.ha.namenodes.zwj", "nn1,nn2");
		conf.set("dfs.namenode.rpc-address.zwj.nn1", "hadoop01:9000");
		conf.set("dfs.namenode.rpc-address.zwj.nn2", "hadoop02:9000");
		conf.set("dfs.client.failover.proxy.provider.zwj",
				"org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
		this.conf = conf;
	}

	public Configuration getConf() {
		// Fall back to a fresh Configuration if setConf() was never called.
		return (conf != null) ? conf : new Configuration();
	}

	public int run(String[] args) throws Exception {
		Configuration conf = getConf();
		Job job = Job.getInstance(conf, "job");
		job.setJarByClass(DescIndexDemo.class);
		// Set the custom mapper.
		job.setMapperClass(MyMapper.class);

		// Set the Combiner.
		job.setCombinerClass(MyCombiner.class);

		// Declare the key/value types of the map output.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		// Set the custom reducer.
		job.setReducerClass(MyReduce.class);
		// Declare the key/value types of the reduce output.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// Set the input and output paths from args.
		setInputAndOutput(job, conf, args);
		return job.waitForCompletion(true) ? 0 : 1;
	}

	private void setInputAndOutput(Job job, Configuration conf, String[] args) throws Exception {
		if (args.length != 2) {
			// Abort early instead of letting the job fail later without paths.
			System.err.println("Usage: DescIndexDemo <input path> <output path>");
			System.exit(1);
		}
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// Delete the output path if it already exists; otherwise the job fails.
		FileSystem fs = FileSystem.get(conf);
		Path outPath = new Path(args[1]);
		if (fs.exists(outPath)) {
			fs.delete(outPath, true);
		}
		FileOutputFormat.setOutputPath(job, outPath);
	}
	/**
	 * Entry point: runs the job through ToolRunner and passes along the
	 * command-line arguments.
	 * @param args
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		int isok = ToolRunner.run(new DescIndexDemo(), args);
		// Exit with the job's status code.
		System.exit(isok);
	}
}
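
As a usage sketch (the jar name and HDFS paths below are assumptions; adjust them to your environment), package the classes into a jar and submit the job with:

hadoop jar descindex.jar com.mapreduce.DescIndexDemo /input /output

With the three sample pages above under /input, the output directory will contain one tab-separated line per word, such as hadoop followed by 1.html:3,2.html:1,3.html:1.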


Reposted from blog.csdn.net/qq_37321741/article/details/81223125