Here is the requirement:
We have three files, and we need to count how many times each word appears in each file, but the output must follow a specific format.
Final format: jee	a.txt-->4 b.txt-->2 c.txt-->2
The word comes first, followed by a \t; after that come the per-file counts of the word, with entries for different files separated by a single space.
We can implement this with two chained MapReduce jobs: the first job outputs records of the form jee--a.txt	4,
and the second job splits those records apart and reassembles them into the required final format. A worked example of the data flow follows.
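To make the data flow concrete, here is a hypothetical trace for the word jee (the counts match the target format above; the actual numbers of course depend on the input files):

Output of job 1 (key "word--fileName", value count, tab-separated):
jee--a.txt	4
jee--b.txt	2
jee--c.txt	2

Map output of job 2 (re-keyed by the word alone):
jee	a.txt-->4
jee	b.txt-->2
jee	c.txt-->2

Reduce output of job 2 (the final format):
jee	a.txt-->4 b.txt-->2 c.txt-->2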
Mapper1:
package com.jee.doublejob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class WCMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Name of the file this mapper's split belongs to, resolved once in setup()
    private String fileName;
    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the input split assigned to this mapper
        FileSplit split = (FileSplit) context.getInputSplit();
        // Extract the file name from the split's path
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Words in the input files are tab-separated
        String[] items = line.split("\t");
        for (String item : items) {
            // Emit "word--fileName" -> 1 for every occurrence
            k.set(item + "--" + fileName);
            context.write(k, v);
        }
    }
}
Reducer1:
package com.jee.doublejob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum up all the 1s emitted for this "word--fileName" key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        // Output line: "word--fileName\tcount"
        context.write(key, v);
    }
}
Mapper2:
package com.jee.doublejob;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper2 extends Mapper<LongWritable, Text, Text, Text> {

    private Text k = new Text();
    private Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line from job 1 looks like: "jee--a.txt\t4"
        String line = value.toString();
        String[] items = line.split("--");
        // items[0] is the word, items[1] is "fileName\tcount"
        k.set(items[0]);
        String[] items2 = items[1].split("\t");
        // Re-key by the word so all per-file counts of one word meet in one reducer
        v.set(items2[0] + "-->" + items2[1]);
        context.write(k, v);
    }
}
Reducer2:
package com.jee.doublejob;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReducer2 extends Reducer<Text, Text, Text, Text> {

    private Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Join all "fileName-->count" entries for this word with single spaces
        StringBuilder sb = new StringBuilder();
        for (Text value : values) {
            if (sb.length() > 0) {
                sb.append(' ');
            }
            sb.append(value.toString());
        }
        v.set(sb.toString());
        // TextOutputFormat separates key and value with a tab, giving
        // the final line: "word\ta.txt-->4 b.txt-->2 c.txt-->2"
        context.write(key, v);
    }
}
Driver:
package com.jee.doublejob;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WCDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // First job: count occurrences of each "word--fileName" pair
        Job job1 = Job.getInstance(new Configuration());
        job1.setJarByClass(WCDriver.class);
        job1.setMapperClass(WCMapper1.class);
        job1.setReducerClass(WCReducer1.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, new Path("d:/Hadoop/input"));
        FileOutputFormat.setOutputPath(job1, new Path("d:/Hadoop/output"));
        boolean b = job1.waitForCompletion(true);

        // If the first job succeeded, run the second job; its input is the
        // first job's output directory
        if (b) {
            Job job2 = Job.getInstance(new Configuration());
            job2.setJarByClass(WCDriver.class);
            job2.setMapperClass(WCMapper2.class);
            job2.setReducerClass(WCReducer2.class);
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(Text.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(Text.class);
            FileInputFormat.setInputPaths(job2, new Path("d:/Hadoop/output"));
            FileOutputFormat.setOutputPath(job2, new Path("d:/Hadoop/output2"));
            boolean b1 = job2.waitForCompletion(true);
            System.exit(b1 ? 0 : 1);
        } else {
            System.exit(1);
        }
    }
}
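A side note on the driver: besides running the jobs back to back with waitForCompletion(), Hadoop also provides JobControl and ControlledJob (in org.apache.hadoop.mapreduce.lib.jobcontrol) for declaring this kind of job dependency. A minimal sketch, assuming job1 and job2 are both fully configured up front as in WCDriver (the group name "wc-chain" and the poll interval are arbitrary placeholders):

import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

// Wrap each Job in a ControlledJob and declare the dependency
ControlledJob cJob1 = new ControlledJob(job1.getConfiguration());
cJob1.setJob(job1);
ControlledJob cJob2 = new ControlledJob(job2.getConfiguration());
cJob2.setJob(job2);
cJob2.addDependingJob(cJob1); // job2 starts only after job1 succeeds

JobControl control = new JobControl("wc-chain");
control.addJob(cJob1);
control.addJob(cJob2);

// JobControl implements Runnable: drive it on its own thread and poll
Thread thread = new Thread(control);
thread.start();
while (!control.allFinished()) {
    Thread.sleep(500);
}
control.stop();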