MapReduce Sorting and Grouping (Secondary Sort)


In Step 1.4, the fourth step of the MapReduce flow, the data in each partition must be sorted and grouped; by default both sorting and grouping are done by key.
Secondary sort

The map phase

1. The InputFormat set via job.setInputFormatClass splits the input data set into small blocks (splits),
then feeds <LongWritable, Text> pairs one by one to the map method of the custom Map class.
The output must match the types declared by that class, here <IntPair, IntWritable>,
ultimately producing a list of <IntPair, IntWritable> pairs.
2. At the end of the map phase, the partitioner set via job.setPartitionerClass partitions this list, mapping each partition to one reducer;
within each partition, records are then sorted by the key comparator set via job.setSortComparatorClass. This is the secondary sort.

If no key comparator is set via job.setSortComparatorClass, the key type's own compareTo method is used instead;
here that would be the compareTo implemented by IntPair. A minimal sketch of such a comparator is shown below.
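For illustration, here is a minimal sketch, not from the original post, of a key comparator that could be registered with job.setSortComparatorClass. The class name KeySortComparator is hypothetical, and it assumes a two-long key such as the MyNewKey class defined later in this post:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical sort comparator: order by the first column, then the second.
public class KeySortComparator extends WritableComparator {

    protected KeySortComparator() {
        // "true" asks the parent to instantiate keys so the object-based
        // compare below receives deserialized MyNewKey instances.
        super(MyNewKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyNewKey k1 = (MyNewKey) a;
        MyNewKey k2 = (MyNewKey) b;
        int cmp = Long.compare(k1.getFistname(), k2.getFistname());
        return cmp != 0 ? cmp : Long.compare(k1.getSecondname(), k2.getSecondname());
    }
}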

The reduce phase

1. After a reducer has received all the map output assigned to it, it again sorts all key/value pairs
using the key comparator set via job.setSortComparatorClass.

2. It then builds one value iterator per group, using the grouping comparator set via job.setGroupingComparatorClass:
whenever that comparator considers two keys equal, they belong to the same group and their values share one iterator.
For example, grouping on the first field alone would put the keys (1,1), (1,2) and (1,3) into a single group.

3. Finally, the Reducer's reduce method is invoked once per group, receiving the group's key together with its value iterator.
Objective:

Task:
Sort the data file by the first column in ascending order;
when the first column is equal, sort by the second column in ascending order.
(When the first column is equal, finding the minimum of the second column is the second task, handled later.)

1. Contents of p.dat:
xm@master:~$ hadoop fs -text /a/p.dat
2 2
1 2
3 3
3 2
3 1
1 3
1 1
2 3
2 1

2. Expected output:
1   1
1   2
1   3
2   1
2   2
2   3
3   1
3   2
3   3

Task analysis: wrap a new custom type around both columns and use it as the key, so that the first and second columns together form the key.

Implementation approach:
The WritableComparable interface
Definition:
public interface WritableComparable<T> extends Writable, Comparable<T> {}
Our custom type MyNewKey implements the WritableComparable interface.
That interface declares a compareTo() method, which is invoked whenever keys are compared;
by supplying our own comparison rule there, we obtain exactly the ordering we want.

Implementation code:

//MyNewKey.java:

package mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class MyNewKey implements WritableComparable<MyNewKey> {
    // the two columns of one input record
    Long fistname;
    Long secondname;

    public MyNewKey() {
    }
    public MyNewKey(Long fist, Long second) {
        fistname = fist;
        secondname = second;
    }

    public Long getFistname() {
        return fistname;
    }

    public Long getSecondname() {
        return secondname;
    }

    // Deserialization: rebuild a MyNewKey from the bytes in the stream
    @Override
    public void readFields(DataInput in) throws IOException {
        fistname = in.readLong();
        secondname = in.readLong();
    }

    // Serialization: write this MyNewKey as bytes to the stream
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(fistname);
        out.writeLong(secondname);
    }

    @Override
    public int compareTo(MyNewKey another) {
        // Compare by the first column, then by the second. Long.compare
        // avoids the overflow risk of casting a long difference to int.
        int cmp = Long.compare(fistname, another.fistname);
        if (cmp != 0) {
            return cmp;
        }
        return Long.compare(secondname, another.secondname);
    }

}
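One caveat the class above does not cover: when the job runs with more than one reducer, the default HashPartitioner hashes the whole MyNewKey object via Object.hashCode(), so keys that share a first column may land on different reducers and first-column grouping would silently break. Below is a minimal sketch of a partitioner on the first column only (the class name is hypothetical, not from the original post), which would be registered via job.setPartitionerClass:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route keys by the first column only, so all
// pairs of one group reach the same reducer.
public class FirstColumnPartitioner extends Partitioner<MyNewKey, NullWritable> {
    @Override
    public int getPartition(MyNewKey key, NullWritable value, int numPartitions) {
        // Mask the sign bit so the partition index is never negative.
        return (key.getFistname().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}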
//MyTest2.java:

package mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyTest2 {

    static String INPUT_PATH = "hdfs://master:9000/a/p.dat";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<LongWritable, Text, MyNewKey, NullWritable> {

        NullWritable out_value = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Wrap both columns into the composite key; no separate value is needed.
            String[] arr = value.toString().split(" ", 2);
            MyNewKey newkey = new MyNewKey(Long.parseLong(arr[0]), Long.parseLong(arr[1]));
            context.write(newkey, out_value);
        }
    }
    static class MyReduce extends Reducer<MyNewKey, NullWritable, LongWritable, LongWritable> {
        LongWritable tokenkey = new LongWritable();
        LongWritable tokenvalue = new LongWritable();

        // Without a grouping comparator, every distinct (first, second) pair
        // forms its own group, so this writes one sorted line per input pair.
        @Override
        protected void reduce(MyNewKey key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {

            tokenkey.set(key.getFistname());
            tokenvalue.set(key.getSecondname());

            context.write(tokenkey, tokenvalue);
        }
    }

    public static void main(String[] args) throws Exception {

        System.setProperty("hadoop.home.dir", "E:/Eclipse/eclipse/hadoop2.6_Win_x64-master");
        Path outputpath = new Path(OUTPUT_PATH);
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://master:9000/");
        FileSystem fs = outputpath.getFileSystem(conf);

        // Remove a leftover output directory from a previous run.
        if (fs.exists(outputpath)) {
            fs.delete(outputpath, true);
        }

        Job job = Job.getInstance(conf);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        job.setMapOutputKeyClass(MyNewKey.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(LongWritable.class);

        job.waitForCompletion(true);
    }
}
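A quick way to sanity-check the job (the jar name sort.jar is hypothetical; part-r-00000 is the default name of the single reducer's output file):

xm@master:~$ hadoop jar sort.jar mr.MyTest2
xm@master:~$ hadoop fs -text /output/part-r-00000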
Objective:

Task:
When the first column is the same, find the minimum value in the second column.

1. Contents of p.dat:
xm@master:~$ hadoop fs -text /a/p.dat
2 2
1 2
3 3
3 2
3 1
1 3
1 1
2 3
2 1

2. Expected output:
1   1
2   1
3   1

Implementation analysis:

Custom grouping:
To group by the new key type, we also need to define our own grouping rule:

private static class MyGroupingComparator implements RawComparator<MyNewKey> {

    /*
     * Basic grouping rule: group by the first column only.
     */
    @Override
    public int compare(MyNewKey key1, MyNewKey key2) {
        return Long.compare(key1.getFistname(), key2.getFistname());
    }

    /*
     * @param b1 the first byte array being compared
     * @param s1 the start offset of the first key within b1
     * @param l1 the length of the first key's serialized bytes
     * @param b2 the second byte array being compared
     * @param s2 the start offset of the second key within b2
     * @param l2 the length of the second key's serialized bytes
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Only the first 8 bytes (the serialized first column) are compared.
        return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
    }
}

1. MyGroupingComparator must implement both compare() overloads, since RawComparator extends java.util.Comparator:
  the compare() from RawComparator works on raw bytes;
  the compare() from Comparator works on deserialized objects.

  MyNewKey contains two long fields, and each long occupies 8 bytes when serialized.
  Because only the first column takes part in grouping, the byte-level comparison reads just the first 8 bytes of each key.

2. Register the grouping rule with the job:
  // set the custom grouping rule
  job.setGroupingComparatorClass(MyGroupingComparator.class);

Task implementation:
Unlike the plain sort above, we write one more class, MyGroupingComparator, implementing the comparator methods, and then register it on the job as the custom grouping rule.
//MyGroupingComparator.java:

package mr;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class MyGroupingComparator implements RawComparator<MyNewKey> {

    // Compare byte by byte until a differing byte is found; that byte decides
    // which byte stream is larger. Only the first 8 bytes (the serialized
    // first column) take part in the comparison.
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
    }

    // Keys with the same first column belong to the same group.
    @Override
    public int compare(MyNewKey key1, MyNewKey key2) {
        return Long.compare(key1.getFistname(), key2.getFistname());
    }
}
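As a possible alternative, not shown in the original post, the same rule can extend WritableComparator instead of implementing RawComparator directly, letting its static readLong helper decode the first 8 bytes of each serialized key. Either variant is registered on the job the same way, as shown next:

import org.apache.hadoop.io.WritableComparator;

// Alternative grouping comparator sketch: decode the first long of each key
// instead of comparing raw bytes. The class name is hypothetical; it assumes
// the same package as MyNewKey.
public class MyGroupingComparator2 extends WritableComparator {

    protected MyGroupingComparator2() {
        super(MyNewKey.class, true);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return Long.compare(readLong(b1, s1), readLong(b2, s2));
    }
}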
Register the custom grouping rule on the job:
job.setGroupingComparatorClass(MyGroupingComparator.class);
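With the grouping comparator registered, the MyTest2 driver above needs only that one extra line. Keys inside a partition are already fully sorted, so the key passed to each reduce call is the first, i.e. smallest, key of its group, and its second column is the group minimum; the existing MyReduce therefore emits exactly one (first column, minimum) line per group. A sketch of the relevant driver section, reusing the classes above:

// Same job wiring as in MyTest2, plus the grouping rule.
job.setMapOutputKeyClass(MyNewKey.class);
job.setMapOutputValueClass(NullWritable.class);

// Merge all keys sharing a first column into a single reduce call; the
// reduce key is then the group's smallest (first, second) pair.
job.setGroupingComparatorClass(MyGroupingComparator.class);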
A plain alternative (without secondary sort):

package sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class Text_simple {

    static String INPUT_PATH = "hdfs://master:9000/a/p.dat";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

        IntWritable out_key = new IntWritable();
        IntWritable out_value = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Emit (first column, second column), reusing the Writable instances.
            String[] arr = value.toString().split(" ", 2);

            out_key.set(Integer.parseInt(arr[0]));
            out_value.set(Integer.parseInt(arr[1]));

            context.write(out_key, out_value);
        }
    }
    static class MyReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        IntWritable tokenvalue = new IntWritable();

        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {

            // The running minimum must be reset for every key; as an instance
            // field it would carry one group's minimum into the next group.
            int a = Integer.MAX_VALUE;
            for (IntWritable c : values) {
                if (c.get() < a) {
                    a = c.get();
                }
            }

            tokenvalue.set(a);
            context.write(key, tokenvalue);
        }
    }

    public static void main(String[] args) throws Exception {

        System.setProperty("hadoop.home.dir", "E:/Eclipse/eclipse/hadoop2.6_Win_x64-master");
        Path outputpath = new Path(OUTPUT_PATH);
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://master:9000/");
        FileSystem fs = outputpath.getFileSystem(conf);

        // Remove a leftover output directory from a previous run.
        if (fs.exists(outputpath)) {
            fs.delete(outputpath, true);
        }

        Job job = Job.getInstance(conf);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);


        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}

Reposted from blog.csdn.net/qq_38262266/article/details/79188414