MapReduce的二次排序

这里介绍二次排序的思路整理，并附上具体代码
   首先要明确二次排序的基本概念：我们之前所熟悉的排序称为一次排序，即只对key进行排序
所以二次排序的概念在原来的基础上便不难理解，即对key进行排序的同时对value进行排序
（1）二次排序非标准版
   所谓二排非标准版：即所有的第二次排序过程全部放到Reduce进行操作，将所有相同key传进来的
values放进一个集合，然后调用Collections.sort方法对其进行排序；由于同一key的全部value都要装入内存，reduce阶段压力大，不建议使用
   代码实现：此处只附上reduce阶段的代码部分

     （2）二次排序标准版
   在标准版二次排序中，将原始的<key,value>对儿转换成新的Newkey，value
   其中Newkey由原来的<key,value>组成而value值不变，即形成了一个大key的概念
   Map 输出结构为<newKey,value>=<(key,value),value>
   1.由于引入了newkey，所以我们要自己定义newkey类，在其类中实现WritableComparable接口
   该接口要求我们实现compareTo方法；按照我们的需求，该方法的主体就是依次对
   key和value进行比较。
   2.自定义 partitioner ，保证分区的一致性 ( 例题中还是按工号做分区 )。
重点是继承Partitioner 类
   3.还要写一个分组类，重点是继承WritableComparator（或实现RawComparator接口）

   代码实现：
public class SecondSortV3 {
/**
* 自定义的 newKey
*/
public static class KeyPairWritable implements
WritableComparable<KeyPairWritable> {
// 组合 key,key1 是分区 key，key2 是二次排序 key
private String key1;
private int key2;
public KeyPairWritable() {
}
public KeyPairWritable(String key1, int key2) {
this.set(key1, key2);
}
// 一次性将两个 key 设置成完
public void set(String key1, int key2) {
this.key1 = key1;
this.key2 = key2;
}
// 当 map 端写出的时候的序列化方法,即 map 如何将对象写出去,保证与读取的顺序一致
@Override
public void write(DataOutput arg0) throws IOException {
arg0.writeUTF(key1);
arg0.writeInt(key2);}
// 在 reducer 读取数据时候的反序列化方法,即 reduce 如何将对象读取出来,保证与写入
的顺序一致
@Override
public void readFields(DataInput arg0) throws IOException {
this.key1 = arg0.readUTF();
this.key2 = arg0.readInt();
}
// 自定义比较器方法，先比较 key1,确定分区号。在分区号相同的情况下，去比较 key2
// 就不需要单独写一个 Comparator 了
public int compareTo(KeyPairWritable o) {
int compare = this.key1.compareTo(o.key1);
if (compare != 0) {
return compare;
} else {
// 降序排列，故将 o 放到前边即可
return Integer.valueOf(o.key2).compareTo(
Integer.valueOf(this.getkey2()));
}
}
public int getkey2() {
return key2;
}
public void setkey2(int key2) {
this.key2 = key2;
}
public String getkey1() {
return key1;
}
public void setkey1(String key1) {
this.key1 = key1;
}
}
// map 类，实现 map 函数
public static class LineProcessMapper extends
Mapper<Object, Text, KeyPairWritable, IntWritable> {
// 暂存每个传过来的词的值，省掉重复申请空间
private KeyPairWritable outputKey = new KeyPairWritable();
private IntWritable outputValue = new IntWritable();
// 核心 map 方法的具体实现,逐个<key,value>对去处理
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
// 通过 context 对象，将 map 的输出逐个输出
String tempLine = value.toString();
if (tempLine != null && tempLine.trim().length() > 0) {
String[] columnArray = tempLine.split("\\s");
outputKey.set(columnArray[0], Integer.parseInt(columnArray[1]));
outputValue.set(Integer.parseInt(columnArray[1]));
context.write(outputKey, outputValue);}
}
}
/**
 * Custom partitioner: records sharing the same first key component
 * (e.g. S1, S2) are routed to the same reduce task.
 */
public static class SecondPartitioner extends
        Partitioner<KeyPairWritable, IntWritable> {

    /**
     * Same formula as the default hash partitioning,
     * {@code (hashCode & Integer.MAX_VALUE) % numPartitions}, but hashing only
     * the first key component instead of the whole composite key.
     *
     * @param key           composite map-output key
     * @param value         map-output value (not used)
     * @param numPartitions number of partitions
     * @return partition index derived from {@code key.getkey1()}
     */
    @Override
    public int getPartition(KeyPairWritable key, IntWritable value,
            int numPartitions) {
        // Masking with Integer.MAX_VALUE clears the sign bit so the result is never negative.
        final int nonNegativeHash = key.getkey1().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
/**
 * Grouping comparator: decides, after the shuffle sort has finished, which
 * consecutive keys belong to the same reduce group.
 */
public static class SecondSortGroupComparator extends WritableComparator {

    /** Registers {@code KeyPairWritable} with the base comparator so it can create key instances. */
    protected SecondSortGroupComparator() {
        super(KeyPairWritable.class, true);
    }

    /**
     * Compares only the first key component, so keys that differ solely in
     * their second component fall into the same group.
     *
     * @param first  left-hand key
     * @param second right-hand key
     * @return ordering of the two first components, or 0 if either argument is null
     */
    @Override
    public int compare(WritableComparable first, WritableComparable second) {
        if (first != null && second != null) {
            String leftGroup = ((KeyPairWritable) first).getkey1();
            String rightGroup = ((KeyPairWritable) second).getkey1();
            return leftGroup.compareTo(rightGroup);
        }
        return 0;
    }
}
// reduce 类，实现 reduce 函数
public static class SortReducer extends
Reducer<KeyPairWritable, IntWritable, Text, IntWritable> {
private Text outputKey = new Text();
// 核心 reduce 方法的具体实现,逐个<key,List(v1,v2)>去处理
public void reduce(KeyPairWritable keyPair,
Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// 进来时已经排序完成
outputKey.set(keyPair.getkey1());for (IntWritable val : values) {
context.write(outputKey, val);
}
}
}
// 启动 mr 的 driver 方法
public static void main(String[] args) throws Exception {
// 得到集群配置参数
Configuration conf = new Configuration();
// 参数解析器
GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
String[] remainingArgs = optionParser.getRemainingArgs();
if ((remainingArgs.length != 2)) {
System.err
.println("Usage: yarn jar jar_path main_class_path -D 参数列表 <in>
<out>");
System.exit(2);
}
// 设置到本次的 job 实例中
Job job = Job.getInstance(conf, "天亮二次排序(标准版)");
// 指定本次执行的主类是 WordCount
job.setJarByClass(SecondSortV3.class);
// 指定 map 类
job.setMapperClass(LineProcessMapper.class);
// 指定 partition 类
job.setPartitionerClass(SecondPartitioner.class);
job.setGroupingComparatorClass(SecondSortGroupComparator.class);
// 指定 reducer 类
job.setReducerClass(SortReducer.class);
// 指定 job 输出的 key 和 value 的类型,如果 map 和 reduce 输出类型不完全相同，需要重
新设置 map 的 output 的 key 和 value 的 class 类型
job.setMapOutputKeyClass(KeyPairWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 指定输入数据的路径
FileInputFormat.addInputPath(job, new Path(remainingArgs[0]));
// 指定输出路径,并要求该输出路径一定是不存在的
FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
// 指定 job 执行模式，等待任务执行完成后，提交任务的客户端才会退出!
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

MapReduce的二次排序

猜你喜欢