数据准备
- 创建一个
createdatas.sh
脚本
#!/bin/bash
for i in {1..1000};do
echo $RANDOM
done;
- 生成数据
$ sh createdatas.sh > data1
$ sh createdatas.sh > data2
$ sh createdatas.sh > data3
自定义分区
Bash 的 $RANDOM 取值范围是
[0,32767]
这个区间,因此,设置 key >20000
为第0分区,key >10000
为第1分区,其他的为第2分区。
package com.hadoop.totasort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* 自定义分区
*/
/**
 * Custom partitioner for the total-sort job.
 *
 * <p>Routes each record into one of 3 partitions by key range so that, combined
 * with the descending comparator, partition 0 receives the largest keys:
 * key &gt; 20000 → partition 0; key &gt; 10000 → partition 1; otherwise → partition 2.
 */
public class MyPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        // key.get() reads the primitive int directly; the original code
        // round-tripped through Integer.parseInt(key.toString()), which
        // allocates a String per record for no benefit.
        int k = key.get();
        if (k > 20000) {
            return 0;
        } else if (k > 10000) {
            return 1;
        } else {
            return 2;
        }
    }
}
自定义排序
shuffle过程中默认的排序是
升序
的,我们需要的是倒排序
。
package com.hadoop.totasort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* 自定义排序
*/
/**
 * Comparator that sorts IntWritable keys in descending order during shuffle.
 */
public class MySort extends WritableComparator {

    // The no-arg constructor is essential: it registers the key class and
    // passes true so the framework instantiates objects for compare() below.
    public MySort() {
        super(IntWritable.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Reverse the natural order by comparing b against a.
        return ((IntWritable) b).compareTo((IntWritable) a);
    }
}
编写Mapper类
package com.hadoop.totasort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper for the total-sort job.
 *
 * <p>Parses each input line as an integer and emits it as both key and value:
 * the key drives partitioning and sorting, the value carries the number to the
 * reducer output.
 */
public class TotalSortMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    // Reused across map() calls — the standard MapReduce idiom to avoid
    // allocating two fresh IntWritable objects per input record.
    private final IntWritable outKey = new IntWritable();
    private final IntWritable outValue = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each line holds one random number from the generator script; trim()
        // guards against stray trailing whitespace breaking parseInt.
        int v = Integer.parseInt(value.toString().trim());
        outKey.set(v);
        outValue.set(v);
        context.write(outKey, outValue);
    }
}
编写Reducer类
package com.hadoop.totasort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer for the total-sort job: writes every value under a NullWritable key,
 * so the output files contain only the sorted numbers (duplicates preserved).
 */
public class TotalSortReducer extends Reducer<IntWritable, IntWritable, NullWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Emit each occurrence of the key; equal numbers arrive grouped here.
        java.util.Iterator<IntWritable> it = values.iterator();
        while (it.hasNext()) {
            context.write(NullWritable.get(), it.next());
        }
    }
}
编写驱动类
package com.hadoop.totasort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the total-sort job: wires up the mapper, reducer, custom
 * partitioner and descending comparator, then runs with 3 reduce tasks so
 * each output file covers one key range.
 */
public class TotalSortDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TotalSortDriver.class);
        job.setMapperClass(TotalSortMapper.class);
        job.setReducerClass(TotalSortReducer.class);

        // Partitioning, sort order, and one reduce task per partition.
        job.setPartitionerClass(MyPartitioner.class);
        job.setSortComparatorClass(MySort.class);
        job.setNumReduceTasks(3);

        // Map output types differ from the final (NullWritable-keyed) output.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("input/totalsort"));
        FileOutputFormat.setOutputPath(job, new Path("output/totalsort"));

        // Propagate the job result: the original ignored the boolean, so the
        // JVM exited 0 even when the job failed.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
运行结果
生成三个文件,
part-r-00000
、part-r-00001
、part-r-00002