I have only been learning Hadoop for a short while, so the code is not particularly concise.
Idea:
Use Hadoop's built-in sorting on keys. Job A counts each letter and its number of occurrences and writes the result to a file; that file then serves as job B's input. Job B emits the count as the key and the letter as the value, which in effect swaps the key and value of job A's output, so the shuffle phase sorts the records by count.
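For example, if job A's output file contains the tab-separated lines

a   3
b   1
c   2

then job B swaps each pair, and after the shuffle sorts on the IntWritable key the final output is ordered by count (ascending by default):

1   b
2   c
3   a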
package wordcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class mc {
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherargs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }
    // set up job A, which counts how many times each letter occurs
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(mc.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // every argument except the last one is an input path to count
    for (int i = 0; i < otherargs.length - 1; i++) {
        FileInputFormat.addInputPath(job, new Path(otherargs[i]));
    }
    // the new-API FileOutputFormat takes the Job directly; no cast to JobConf is needed
    FileOutputFormat.setOutputPath(job, new Path(otherargs[otherargs.length - 1]));
    boolean b = job.waitForCompletion(true);
    // run job B only once job A has finished successfully
    if (b) {
        Configuration confs = new Configuration();
        Job job2 = Job.getInstance(confs, "word count2");
        job2.setJarByClass(mc.class);
        job2.setMapperClass(TokenizerMapper2.class);
        // no reducer is set, so the identity reducer passes records through
        // and the shuffle sorts them by the IntWritable key (the count)
        job2.setOutputKeyClass(IntWritable.class);
        job2.setOutputValueClass(Text.class);
        // job B reads job A's output directory (the part-r-* files inside it)
        FileInputFormat.addInputPath(job2, new Path(otherargs[otherargs.length - 1]));
        FileOutputFormat.setOutputPath(job2, new Path("hdfs:/output2"));
        boolean b1 = job2.waitForCompletion(true);
        // once job B is done, delete job A's output so only one output folder remains
        if (b1) {
            FileSystem fs = FileSystem.get(conf);
            Path path = new Path(otherargs[otherargs.length - 1]);
            // delete(path, true) removes the directory right away;
            // deleteOnExit would only mark it for deletion when the JVM exits
            boolean isok = fs.delete(path, true);
            System.exit(isok ? 0 : 1);
        }
        System.exit(1);
    } else {
        System.exit(1);
    }
}
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // add up all the counts collected for this letter
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
public static class TokenizerMapper2 extends Mapper<Object, Text, IntWritable, Text> {
    private IntWritable one = new IntWritable();
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // each line of job A's output has the form "letter<TAB>count"
        String[] stringsplit = value.toString().split("\t");
        // even indices hold the letter, the following odd index holds its count
        for (int i = 0; i + 1 < stringsplit.length; i += 2) {
            word.set(stringsplit[i]);
            one.set(Integer.parseInt(stringsplit[i + 1]));
            // swap key and value so the shuffle sorts the records by count
            context.write(one, word);
        }
    }
}
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // strip digits and punctuation (both ASCII and full-width) from the line
        String str = value.toString().replaceAll(
                "[`éê£0123456789~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&;*()——+|{}【】‘;:”“’。,、?|-]", "");
        str = str.trim();
        // split("") breaks the line into single characters; this choice decides
        // whether the job counts letters or whole words (use split(" ") for words)
        String[] stringsplit = str.split("");
        // emit <letter, 1> for every character, i.e. <a, [1, 1, 1, ...]> after grouping
        for (int i = 0; i < stringsplit.length; i++) {
            // skip the spaces (and empty strings) left between words
            if (stringsplit[i].trim().isEmpty()) {
                continue;
            }
            word.set(stringsplit[i]);
            context.write(word, one);
        }
    }
}
}
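To run the two chained jobs, something like the following should work. This is only a sketch: the jar name wordcount.jar and the input path /input are assumptions, <out> here is /output (which the program deletes again at the end), and job B's result lands in the hardcoded hdfs:/output2 directory:

hadoop jar wordcount.jar wordcount.mc /input /output
hdfs dfs -cat /output2/part-r-00000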
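A caveat on the sort order: the shuffle sorts IntWritable keys ascending, so the most frequent letters come last. If descending order is wanted, a custom sort comparator can be registered on job B. The sketch below is an assumption, not part of the code above; DescendingIntComparator is a name chosen here, and the class would be nested inside mc (it additionally needs the org.apache.hadoop.io.WritableComparable and org.apache.hadoop.io.WritableComparator imports):

// sorts IntWritable keys from largest to smallest
public static class DescendingIntComparator extends WritableComparator {
    protected DescendingIntComparator() {
        super(IntWritable.class, true); // true: create key instances for comparing
    }

    @SuppressWarnings({"rawtypes", "unchecked"})
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return -a.compareTo(b); // invert the default ascending order
    }
}

Then, in main, before job B is submitted:

job2.setSortComparatorClass(DescendingIntComparator.class);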