After processing the data, you may want to write the results into different output files according to some condition; the data in each of those files is then one partition of the output.
1. HashPartitioner (Hadoop's built-in default partitioner)
The default partitioner derives the partition from the key's hashCode modulo the number of ReduceTasks, so the user has no control over which partition a particular key ends up in.
The HashPartitioner source is as follows:
public class HashPartitioner<K, V> extends Partitioner<K, V> {
    public int getPartition(K key, V value, int numReduceTasks) {
        // Clear the sign bit (2147483647 == Integer.MAX_VALUE) so the result is
        // non-negative, then take the key's hash modulo the number of ReduceTasks
        return (key.hashCode() & 2147483647) % numReduceTasks;
    }
}
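As a quick illustration of the formula, the following standalone sketch (a hypothetical demo class, not part of Hadoop) prints the partition that HashPartitioner would assign to a few Text keys when there are 3 ReduceTasks:

import org.apache.hadoop.io.Text;

public class HashPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        for (String s : new String[]{"hadoop", "hive", "spark", "flink"}) {
            Text key = new Text(s);
            // Same computation as HashPartitioner.getPartition():
            // clear the sign bit, then mod by the number of ReduceTasks
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(s + " -> partition " + partition);
        }
    }
}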
2. Custom partitioner
(1) Define a class that extends Partitioner and override the getPartition(K key, V value, int numReduceTasks) method:
public class OrderPartition extends Partitioner<Order, NullWritable> {
    @Override
    public int getPartition(Order order, NullWritable nullWritable, int numReduces) {
        // Same hash-and-mod strategy as HashPartitioner, but keyed on the order ID
        return (order.getId().hashCode() & Integer.MAX_VALUE) % numReduces;
    }
}
(2) Register the custom Partitioner in the job driver class:
job.setPartitionerClass(OrderPartition.class);
(3) Set a matching number of ReduceTasks:
job.setNumReduceTasks(5);
3. Caveats
(1) If the number of ReduceTasks (M) is greater than the number of partitions getPartition can return (N), M - N empty output files are produced.
(2) If 1 < M < N, some partitions have no ReduceTask to go to and the job fails with an exception (typically an IOException reporting an illegal partition).
(3) If M = 1, then no matter how many partitions the MapTask side produces, they are all handed to that single ReduceTask and only one output file is generated.
(4) Partition numbers start at 0 and must increase consecutively.
Example: suppose the custom partitioner defines 5 partitions.
job.setNumReduceTasks(1); // runs fine; produces a single output file
job.setNumReduceTasks(2); // 1 < 2 < 5: fails with an exception
job.setNumReduceTasks(6); // runs fine; produces 6 - 5 = 1 empty output file
4. Case study: partition phone-traffic records into different files by phone-number prefix
The Mapper parses each line, extracts the phone number and the traffic columns, and emits (phoneNum, Flow):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowMapper extends Mapper<LongWritable, Text, Text, Flow> {
    @Override
    protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        Flow flow = new Flow();
        String[] split = line.toString().split("\t");
        String phoneNum = split[1];                               // phone number is the second column
        flow.setUpFlow(Long.valueOf(split[split.length - 3]));    // upstream traffic: third-from-last column
        flow.setDownFlow(Long.valueOf(split[split.length - 2]));  // downstream traffic: second-from-last column
        flow.setSumFlow(flow.getUpFlow() + flow.getDownFlow());   // total traffic
        context.write(new Text(phoneNum), flow);
    }
}
The Reducer sums the traffic of all records that share a phone number:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowReducer extends Reducer<Text, Flow, Text, Flow> {
    @Override
    protected void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
        long upFlow = 0L;
        long downFlow = 0L;
        long sumFlow = 0L;
        // Accumulate all traffic records for this phone number
        for (Flow next : values) {
            upFlow += next.getUpFlow();
            downFlow += next.getDownFlow();
            sumFlow += next.getSumFlow();
        }
        Flow flow = new Flow();
        flow.setUpFlow(upFlow);
        flow.setDownFlow(downFlow);
        flow.setSumFlow(sumFlow);
        context.write(key, flow);
    }
}
The custom Partitioner routes each record by the first three digits of the phone number:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text, Flow> {
    @Override
    public int getPartition(Text key, Flow flow, int numReduceTasks) {
        // 1. Take the first three digits of the phone number
        String preNum = key.toString().substring(0, 3);
        int partition;
        // 2. Map the prefix ("province") to a partition number
        if ("136".equals(preNum)) {
            partition = 0;
        } else if ("137".equals(preNum)) {
            partition = 1;
        } else if ("138".equals(preNum)) {
            partition = 2;
        } else if ("139".equals(preNum)) {
            partition = 3;
        } else {
            partition = 4; // all other prefixes
        }
        return partition;
    }
}
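The if/else chain is fine for four prefixes, but as the mapping grows it is easier to maintain as a lookup table. A possible table-driven variant (an alternative sketch, not the code used in this case study):

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitionerV2 extends Partitioner<Text, Flow> {
    // Prefix -> partition number; any unlisted prefix falls back to partition 4
    private static final Map<String, Integer> PREFIX_TO_PARTITION = new HashMap<>();
    static {
        PREFIX_TO_PARTITION.put("136", 0);
        PREFIX_TO_PARTITION.put("137", 1);
        PREFIX_TO_PARTITION.put("138", 2);
        PREFIX_TO_PARTITION.put("139", 3);
    }

    @Override
    public int getPartition(Text key, Flow flow, int numReduceTasks) {
        String preNum = key.toString().substring(0, 3);
        return PREFIX_TO_PARTITION.getOrDefault(preNum, 4);
    }
}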
Flow is the custom Writable bean carrying the traffic fields:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class Flow implements Writable, Comparable<Flow> {
    private Long upFlow;   // upstream traffic
    private Long downFlow; // downstream traffic
    private Long sumFlow;  // total traffic

    @Override
    public String toString() {
        return "Flow{" + "upFlow=" + upFlow + ", downFlow=" + downFlow + ", sumFlow=" + sumFlow + '}';
    }

    public Long getUpFlow() { return upFlow; }
    public void setUpFlow(long upFlow) { this.upFlow = upFlow; }
    public Long getDownFlow() { return downFlow; }
    public void setDownFlow(long downFlow) { this.downFlow = downFlow; }
    public Long getSumFlow() { return sumFlow; }
    public void setSumFlow(long sumFlow) { this.sumFlow = sumFlow; }

    // Serialization: the field order here must match readFields() below
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

    @Override
    public int compareTo(Flow o) {
        return this.getSumFlow().compareTo(o.getSumFlow());
    }
}
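Because the framework calls write() on the map side and readFields() on the reduce side, the two methods must handle the fields in exactly the same order. A minimal local round-trip sketch (a hypothetical test class, assuming the Flow class above is on the classpath) makes the contract easy to verify:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class FlowSerdeTest {
    public static void main(String[] args) throws Exception {
        Flow out = new Flow();
        out.setUpFlow(6960L);
        out.setDownFlow(690L);
        out.setSumFlow(7650L);

        // Serialize with write(), then deserialize a fresh instance with readFields()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        out.write(new DataOutputStream(bytes));
        Flow in = new Flow();
        in.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Expected: Flow{upFlow=6960, downFlow=690, sumFlow=7650}
        System.out.println(in);
    }
}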
Finally, the driver wires everything together (the main method lives in a PartitionerDriver class, which setJarByClass references; note that HDFS paths use forward slashes):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PartitionerDriver {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // Local path of the jar containing this driver class
        job.setJarByClass(PartitionerDriver.class);
        // Mapper/Reducer classes for this job
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        // Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        // Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Flow.class);
        // Register the custom Partitioner
        job.setPartitionerClass(ProvincePartitioner.class);
        // Number of ReduceTasks matches the 5 partitions
        job.setNumReduceTasks(5);
        // Input and output directories on HDFS (forward slashes, not "\\")
        FileInputFormat.setInputPaths(job, new Path("/mapreduce/flow/phoneflow"));
        FileOutputFormat.setOutputPath(job, new Path("/mapreduce/flow/output"));
        boolean completion = job.waitForCompletion(true);
        System.exit(completion ? 0 : 1);
    }
}
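Assuming the classes above are packaged into a jar named flow.jar (a hypothetical name), the job can then be submitted with:

[root@master mapreduce]# hadoop jar flow.jar PartitionerDriver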
Execution results:
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00000
13630577991 Flow{upFlow=6960, downFlow=690, sumFlow=7650}
13682846555 Flow{upFlow=1938, downFlow=2910, sumFlow=4848}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00001
13729199489 Flow{upFlow=240, downFlow=0, sumFlow=240}
13736230513 Flow{upFlow=2481, downFlow=24681, sumFlow=27162}
13768778790 Flow{upFlow=120, downFlow=120, sumFlow=240}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00002
13846544121 Flow{upFlow=264, downFlow=0, sumFlow=264}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00003
13956435636 Flow{upFlow=132, downFlow=1512, sumFlow=1644}
13966251146 Flow{upFlow=240, downFlow=0, sumFlow=240}
13975057813 Flow{upFlow=11058, downFlow=48243, sumFlow=59301}
13992314666 Flow{upFlow=3008, downFlow=3720, sumFlow=6728}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00004
13470253144 Flow{upFlow=180, downFlow=180, sumFlow=360}
13509468723 Flow{upFlow=7335, downFlow=110349, sumFlow=117684}
13560439638 Flow{upFlow=918, downFlow=4938, sumFlow=5856}
13568436656 Flow{upFlow=3597, downFlow=25635, sumFlow=29232}
13590439668 Flow{upFlow=1116, downFlow=954, sumFlow=2070}
15043685818 Flow{upFlow=3659, downFlow=3538, sumFlow=7197}
15910133277 Flow{upFlow=3156, downFlow=2936, sumFlow=6092}
15959002129 Flow{upFlow=1938, downFlow=180, sumFlow=2118}
18271575951 Flow{upFlow=1527, downFlow=2106, sumFlow=3633}
18390173782 Flow{upFlow=9531, downFlow=2412, sumFlow=11943}
84188413 Flow{upFlow=4116, downFlow=1432, sumFlow=5548}