Total Sort via a Custom Partitioner

Data Preparation

  • Create a script named createdatas.sh
#!/bin/bash
# $RANDOM yields a uniform random integer in [0, 32767]
for i in {1..1000}; do
    echo $RANDOM
done
  • Generate the data
$ sh createdatas.sh > data1
$ sh createdatas.sh > data2
$ sh createdatas.sh > data3
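
To see the actual value range before picking partition boundaries (this is the "observation" referred to in the next section):

$ sort -n data1 data2 data3 | head -1   # smallest value
$ sort -n data1 data2 data3 | tail -1   # largest value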

Custom Partitioner

Bash's $RANDOM yields integers in [0, 32767], so all keys fall within that range. We send keys > 20000 to partition 0, keys > 10000 to partition 1, and everything else to partition 2. Combined with the descending sort comparator defined below, partition 0 receives the largest keys, so the output files read in order form one globally sorted sequence.

package com.hadoop.totasort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner: routes each key to one of three partitions by value range.
 * getPartition must return a value in [0, numPartitions).
 */
public class MyPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int keyInt = key.get(); // IntWritable already wraps an int; no string parsing needed
        if (keyInt > 20000) {
            return 0;
        } else if (keyInt > 10000) {
            return 1;
        } else {
            return 2;
        }
    }
}
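
As a quick sanity check (a hypothetical standalone snippet, not part of the job itself), you can exercise the partitioner with boundary values; note that 20000 itself lands in partition 1 and 10000 in partition 2, since the comparisons are strict:

import org.apache.hadoop.io.IntWritable;

public class PartitionerCheck {
    public static void main(String[] args) {
        MyPartitioner p = new MyPartitioner();
        int[] samples = {32767, 20001, 20000, 10001, 10000, 0};
        for (int s : samples) {
            // 3 matches the job's reduce task count
            System.out.println(s + " -> partition "
                    + p.getPartition(new IntWritable(s), new IntWritable(s), 3));
        }
    }
}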

Custom Sort

During the shuffle phase the default key sort is ascending; here we want descending order.

package com.hadoop.totasort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom comparator that reverses the default ascending order of IntWritable keys.
 */
public class MySort extends WritableComparator {
    public MySort() {
        // This constructor matters: it registers IntWritable as the key class,
        // and `true` tells the parent to create key instances for deserialization.
        super(IntWritable.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        IntWritable v1 = (IntWritable) a;
        IntWritable v2 = (IntWritable) b;
        return v2.compareTo(v1); // reversed operands => descending order
    }
}
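
A similarly minimal check (hypothetical, outside the MapReduce job) confirms the reversed ordering: a positive result means the first argument sorts after the second, so larger values come first:

import org.apache.hadoop.io.IntWritable;

public class SortCheck {
    public static void main(String[] args) {
        MySort cmp = new MySort();
        // Positive result: 5 sorts after 9, i.e. descending order.
        System.out.println(cmp.compare(new IntWritable(5), new IntWritable(9)));
    }
}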

Writing the Mapper Class

package com.hadoop.totasort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class TotalSortMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        int v = Integer.parseInt(value.toString());
        // v goes in the key so it gets partitioned and sorted,
        // and in the value so it reaches the reducer's output.
        context.write(new IntWritable(v), new IntWritable(v));
    }
}
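
Allocating two new IntWritable objects per input record works, but a common Hadoop idiom is to reuse a single Writable across map() calls, because context.write() serializes its arguments immediately. An equivalent sketch (same package and imports as above):

public class TotalSortMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    private final IntWritable out = new IntWritable(); // reused across calls

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        out.set(Integer.parseInt(value.toString().trim())); // trim guards against stray whitespace
        context.write(out, out); // safe: write() copies the serialized bytes immediately
    }
}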

Writing the Reducer Class

package com.hadoop.totasort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class TotalSortReducer extends Reducer<IntWritable, IntWritable, NullWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Duplicate keys arrive grouped; write each occurrence so no value is lost.
        for (IntWritable value : values) {
            context.write(NullWritable.get(), value);
        }
    }
}

Writing the Driver Class

package com.hadoop.totasort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TotalSortDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TotalSortDriver.class);
        job.setMapperClass(TotalSortMapper.class);
        job.setReducerClass(TotalSortReducer.class);
        // Partitioner, sort comparator, and reduce task count
        // (the count must match the three partitions defined in MyPartitioner)
        job.setPartitionerClass(MyPartitioner.class);
        job.setSortComparatorClass(MySort.class);
        job.setNumReduceTasks(3);
        // Key/value types and I/O paths
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("input/totalsort"));
        FileOutputFormat.setOutputPath(job, new Path("output/totalsort"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
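
To run the job after packaging it (the jar name below is illustrative), with input/totalsort already holding data1 through data3:

$ hadoop jar totalsort.jar com.hadoop.totasort.TotalSortDriver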

Run Results

Three files are generated: part-r-00000, part-r-00001, and part-r-00002. part-r-00000 holds keys greater than 20000, part-r-00001 holds keys in (10000, 20000], and part-r-00002 holds the rest; each file is internally sorted in descending order, so concatenating them in order yields the full, globally sorted result.

Source: www.cnblogs.com/JZTX123/p/10664163.html