Implementing a Custom Combiner
package com.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyCombiner extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        /*
         * Incoming records:
         * key              value
         * hadoop_1.html    1
         * hadoop_1.html    1
         *
         * Outgoing records:
         * key      value
         * hadoop   1.html:3
         */
        String[] split = key.toString().split("_");
        // Sum the per-occurrence counts for this word_file key.
        int count = 0;
        for (Text t : values) {
            count += Integer.parseInt(t.toString());
        }
        // Re-key on the word alone and fold "file:count" into the value.
        context.write(new Text(split[0]), new Text(split[1] + ":" + count));
    }
}
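One caveat: Hadoop treats a combiner as an optimization and may run it zero, one, or several times per key, so re-keying records inside a combiner, as done here, only behaves correctly when the combiner runs exactly once for each map-output key (a second pass over an already-combined record would fail to parse "1.html:3" as a number). To sanity-check the transformation itself, the following standalone sketch (a hypothetical demo class, not part of the job) replays the combine step on in-memory data:

package com.mapreduce;

import java.util.Arrays;
import java.util.List;

// Hypothetical demo class; illustrates the combine step only.
public class CombinerLogicDemo {
    public static void main(String[] args) {
        // One map-output group: the key is "word_file", values are raw counts.
        String key = "hadoop_1.html";
        List<String> values = Arrays.asList("1", "1", "1");
        String[] split = key.split("_");
        int count = 0;
        for (String v : values) {
            count += Integer.parseInt(v);
        }
        // Prints: hadoop -> 1.html:3
        System.out.println(split[0] + " -> " + split[1] + ":" + count);
    }
}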
The Full Implementation
package com.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Input:
 * 1.html
 *   hadoop hadoop hadoop is good
 * 2.html
 *   hadoop hbase hbase is better is good
 * 3.html
 *   hbase hadoop hive hive is nice is good
 *
 * Output:
 *
 * hadoop:1.html:3,2.html:1,3.html:1
 * is:1.html:1,2.html:2,3.html:2
 * .
 * .
 * .
 * @author Administrator
 *
 */
public class DescIndexDemo implements Tool {

    // Held so getConf() can return the configuration ToolRunner passes in.
    private Configuration conf;

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        /**
         * value holds one line of the input file.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the name of the file this split belongs to.
            InputSplit is = context.getInputSplit();
            String fileName = ((FileSplit) is).getPath().getName();
            String line = value.toString();
            String[] words = line.split(" ");
            for (String word : words) {
                // Emit (word_fileName, 1) once per occurrence.
                context.write(new Text(word + "_" + fileName), new Text("1"));
            }
        }
    }
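    // Example: for 1.html containing "hadoop hadoop hadoop is good", the mapper
    // emits (hadoop_1.html, 1) three times plus (is_1.html, 1) and
    // (good_1.html, 1); the combiner then collapses those groups into
    // (hadoop, 1.html:3), (is, 1.html:1) and (good, 1.html:1).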
    /**
     * Input (after the combiner):
     * key      value
     * hadoop   1.html:3
     * hadoop   2.html:1
     *
     * Output:
     *
     * hadoop:1.html:3,2.html:1,3.html:1
     * is:1.html:1,2.html:2,3.html:2
     * @author Administrator
     *
     */
    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Join the "file:count" entries for this word with commas.
            StringBuilder sb = new StringBuilder();
            for (Text t : values) {
                sb.append(t).append(",");
            }
            // Drop the trailing comma before writing.
            context.write(key, new Text(sb.substring(0, sb.length() - 1)));
        }
    }
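    // Note: MapReduce sorts keys before they reach the reducer, but the order
    // of values within one key's group is not guaranteed, so the file list may
    // come out as "2.html:1,1.html:3,..." unless a secondary sort is added.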
    /**
     * Store the configuration and point it at the HA HDFS cluster.
     * The nameservice "zwj" and the hosts hadoop01/hadoop02 are specific
     * to this cluster; adjust them for your environment.
     */
    public void setConf(Configuration conf) {
        // Keep the reference so getConf() returns this same instance
        // (returning a fresh Configuration here would silently drop
        // these settings when run() calls getConf()).
        this.conf = conf;
        conf.set("fs.defaultFS", "hdfs://zwj");
        conf.set("dfs.nameservices", "zwj");
        conf.set("dfs.ha.namenodes.zwj", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.zwj.nn1", "hadoop01:9000");
        conf.set("dfs.namenode.rpc-address.zwj.nn2", "hadoop02:9000");
        conf.set("dfs.client.failover.proxy.provider.zwj",
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    }

    public Configuration getConf() {
        return conf;
    }
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "job");
        job.setJarByClass(DescIndexDemo.class);
        // Set the custom mapper.
        job.setMapperClass(MyMapper.class);
        // Set the combiner.
        job.setCombinerClass(MyCombiner.class);
        // Types of the map-phase output key and value.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Set the custom reducer.
        job.setReducerClass(MyReduce.class);
        // Types of the reduce-phase output key and value.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Wire up the input and output paths from args.
        setInputAndOutput(job, conf, args);
        return job.waitForCompletion(true) ? 0 : 1;
    }
    private void setInputAndOutput(Job job, Configuration conf, String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("Usage: DescIndexDemo <input path> <output path>");
            return;
        }
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path(args[1]);
        // Delete any pre-existing output directory so the job can run.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
    }
    /**
     * Entry point: hands the job to ToolRunner along with the
     * command-line arguments.
     * @param args input and output paths
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int isok = ToolRunner.run(new DescIndexDemo(), args);
        // Exit with the job's status code.
        System.exit(isok);
    }
}
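Assuming the classes above are packaged into a jar (the jar name here is only an example), the job takes the input and output directories as its two arguments:

hadoop jar descindex.jar com.mapreduce.DescIndexDemo /input /output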