After processing the data, you may want to write the results into different output files according to some condition; the data in each of those files is then one partition of the output.
1. HashPartitioner (Hadoop's built-in default partitioner)
The default partitioner derives the partition from the key's hashCode modulo the number of ReduceTasks, so the user has no control over which partition a particular key ends up in.
The HashPartitioner source is as follows:
public class HashPartitioner<K, V> extends Partitioner<K, V> {
    public int getPartition(K key, V value, int numReduceTasks) {
        // Clear the sign bit (2147483647 == Integer.MAX_VALUE) so the result is
        // non-negative, then take the key's hash modulo the number of ReduceTasks
        return (key.hashCode() & 2147483647) % numReduceTasks;
    }
}
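As a quick illustration of the formula, the following standalone sketch (a hypothetical demo class, not part of Hadoop) prints the partition that HashPartitioner would assign to a few Text keys when there are 3 ReduceTasks:

import org.apache.hadoop.io.Text;

public class HashPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        for (String s : new String[]{"hadoop", "hive", "spark", "flink"}) {
            Text key = new Text(s);
            // Same computation as HashPartitioner.getPartition():
            // clear the sign bit, then mod by the number of ReduceTasks
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(s + " -> partition " + partition);
        }
    }
}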
2. Custom partitioner
(1) Define a class that extends Partitioner and override the getPartition(K key, V value, int numReduceTasks) method:
public class OrderPartition extends Partitioner<Order, NullWritable> {
    @Override
    public int getPartition(Order order, NullWritable nullWritable, int numReduces) {
        // Same hash-and-mod strategy as HashPartitioner, but keyed on the order ID
        return (order.getId().hashCode() & Integer.MAX_VALUE) % numReduces;
    }
}
(2) Register the custom Partitioner in the job driver class:
job.setPartitionerClass(OrderPartition.class);
(3) Set a matching number of ReduceTasks:
job.setNumReduceTasks(5);
3. Caveats
(1) If the number of ReduceTasks (M) is greater than the number of partitions getPartition can return (N), M - N empty output files are produced.
(2) If 1 < M < N, some partitions have no ReduceTask to go to and the job fails with an exception (typically an IOException reporting an illegal partition).
(3) If M = 1, then no matter how many partitions the MapTask side produces, they are all handed to that single ReduceTask and only one output file is generated.
(4) Partition numbers start at 0 and must increase consecutively.
Example: suppose the custom partitioner defines 5 partitions.
job.setNumReduceTasks(1); // runs fine; produces a single output file
job.setNumReduceTasks(2); // 1 < 2 < 5: fails with an exception
job.setNumReduceTasks(6); // runs fine; produces 6 - 5 = 1 empty output file
4. Case study: partition phone-traffic records into different files by phone-number prefix
The Mapper parses each line, extracts the phone number and the traffic columns, and emits (phoneNum, Flow):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowMapper extends Mapper<LongWritable, Text, Text, Flow> {
    @Override
    protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        Flow flow = new Flow();
        String[] split = line.toString().split("\t");
        String phoneNum = split[1];                               // phone number is the second column
        flow.setUpFlow(Long.valueOf(split[split.length - 3]));    // upstream traffic: third-from-last column
        flow.setDownFlow(Long.valueOf(split[split.length - 2]));  // downstream traffic: second-from-last column
        flow.setSumFlow(flow.getUpFlow() + flow.getDownFlow());   // total traffic
        context.write(new Text(phoneNum), flow);
    }
}
The Reducer sums the traffic of all records that share a phone number:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowReducer extends Reducer<Text, Flow, Text, Flow> {
    @Override
    protected void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
        long upFlow = 0L;
        long downFlow = 0L;
        long sumFlow = 0L;
        // Accumulate all traffic records for this phone number
        for (Flow next : values) {
            upFlow += next.getUpFlow();
            downFlow += next.getDownFlow();
            sumFlow += next.getSumFlow();
        }
        Flow flow = new Flow();
        flow.setUpFlow(upFlow);
        flow.setDownFlow(downFlow);
        flow.setSumFlow(sumFlow);
        context.write(key, flow);
    }
}
The custom Partitioner routes each record by the first three digits of the phone number:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text, Flow> {
    @Override
    public int getPartition(Text key, Flow flow, int numReduceTasks) {
        // 1. Take the first three digits of the phone number
        String preNum = key.toString().substring(0, 3);
        int partition;
        // 2. Map the prefix ("province") to a partition number
        if ("136".equals(preNum)) {
            partition = 0;
        } else if ("137".equals(preNum)) {
            partition = 1;
        } else if ("138".equals(preNum)) {
            partition = 2;
        } else if ("139".equals(preNum)) {
            partition = 3;
        } else {
            partition = 4; // all other prefixes
        }
        return partition;
    }
}
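The if/else chain is fine for four prefixes, but as the mapping grows it is easier to maintain as a lookup table. A possible table-driven variant (an alternative sketch, not the code used in this case study):

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitionerV2 extends Partitioner<Text, Flow> {
    // Prefix -> partition number; any unlisted prefix falls back to partition 4
    private static final Map<String, Integer> PREFIX_TO_PARTITION = new HashMap<>();
    static {
        PREFIX_TO_PARTITION.put("136", 0);
        PREFIX_TO_PARTITION.put("137", 1);
        PREFIX_TO_PARTITION.put("138", 2);
        PREFIX_TO_PARTITION.put("139", 3);
    }

    @Override
    public int getPartition(Text key, Flow flow, int numReduceTasks) {
        String preNum = key.toString().substring(0, 3);
        return PREFIX_TO_PARTITION.getOrDefault(preNum, 4);
    }
}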
Flow is the custom Writable bean carrying the traffic fields:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class Flow implements Writable, Comparable<Flow> {
    private Long upFlow;   // upstream traffic
    private Long downFlow; // downstream traffic
    private Long sumFlow;  // total traffic

    @Override
    public String toString() {
        return "Flow{" + "upFlow=" + upFlow + ", downFlow=" + downFlow + ", sumFlow=" + sumFlow + '}';
    }

    public Long getUpFlow() { return upFlow; }
    public void setUpFlow(long upFlow) { this.upFlow = upFlow; }
    public Long getDownFlow() { return downFlow; }
    public void setDownFlow(long downFlow) { this.downFlow = downFlow; }
    public Long getSumFlow() { return sumFlow; }
    public void setSumFlow(long sumFlow) { this.sumFlow = sumFlow; }

    // Serialization: the field order here must match readFields() below
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(downFlow);
        dataOutput.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        downFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

    @Override
    public int compareTo(Flow o) {
        return this.getSumFlow().compareTo(o.getSumFlow());
    }
}
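Because the framework calls write() on the map side and readFields() on the reduce side, the two methods must handle the fields in exactly the same order. A minimal local round-trip sketch (a hypothetical test class, assuming the Flow class above is on the classpath) makes the contract easy to verify:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class FlowSerdeTest {
    public static void main(String[] args) throws Exception {
        Flow out = new Flow();
        out.setUpFlow(6960L);
        out.setDownFlow(690L);
        out.setSumFlow(7650L);

        // Serialize with write(), then deserialize a fresh instance with readFields()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        out.write(new DataOutputStream(bytes));
        Flow in = new Flow();
        in.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Expected: Flow{upFlow=6960, downFlow=690, sumFlow=7650}
        System.out.println(in);
    }
}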
Finally, the driver wires everything together (the main method lives in a PartitionerDriver class, which setJarByClass references; note that HDFS paths use forward slashes):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PartitionerDriver {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // Local path of the jar containing this driver class
        job.setJarByClass(PartitionerDriver.class);
        // Mapper/Reducer classes for this job
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        // Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        // Final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Flow.class);
        // Register the custom Partitioner
        job.setPartitionerClass(ProvincePartitioner.class);
        // Number of ReduceTasks matches the 5 partitions
        job.setNumReduceTasks(5);
        // Input and output directories on HDFS (forward slashes, not "\\")
        FileInputFormat.setInputPaths(job, new Path("/mapreduce/flow/phoneflow"));
        FileOutputFormat.setOutputPath(job, new Path("/mapreduce/flow/output"));
        boolean completion = job.waitForCompletion(true);
        System.exit(completion ? 0 : 1);
    }
}
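Assuming the classes above are packaged into a jar named flow.jar (a hypothetical name), the job can then be submitted with:

[root@master mapreduce]# hadoop jar flow.jar PartitionerDriver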
Execution results:
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00000
13630577991 Flow{upFlow=6960, downFlow=690, sumFlow=7650}
13682846555 Flow{upFlow=1938, downFlow=2910, sumFlow=4848}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00001
13729199489 Flow{upFlow=240, downFlow=0, sumFlow=240}
13736230513 Flow{upFlow=2481, downFlow=24681, sumFlow=27162}
13768778790 Flow{upFlow=120, downFlow=120, sumFlow=240}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00002
13846544121 Flow{upFlow=264, downFlow=0, sumFlow=264}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00003
13956435636 Flow{upFlow=132, downFlow=1512, sumFlow=1644}
13966251146 Flow{upFlow=240, downFlow=0, sumFlow=240}
13975057813 Flow{upFlow=11058, downFlow=48243, sumFlow=59301}
13992314666 Flow{upFlow=3008, downFlow=3720, sumFlow=6728}
[root@master mapreduce]# hdfs dfs -text /mapreduce/flow/output/part-r-00004
13470253144 Flow{upFlow=180, downFlow=180, sumFlow=360}
13509468723 Flow{upFlow=7335, downFlow=110349, sumFlow=117684}
13560439638 Flow{upFlow=918, downFlow=4938, sumFlow=5856}
13568436656 Flow{upFlow=3597, downFlow=25635, sumFlow=29232}
13590439668 Flow{upFlow=1116, downFlow=954, sumFlow=2070}
15043685818 Flow{upFlow=3659, downFlow=3538, sumFlow=7197}
15910133277 Flow{upFlow=3156, downFlow=2936, sumFlow=6092}
15959002129 Flow{upFlow=1938, downFlow=180, sumFlow=2118}
18271575951 Flow{upFlow=1527, downFlow=2106, sumFlow=3633}
18390173782 Flow{upFlow=9531, downFlow=2412, sumFlow=11943}
84188413 Flow{upFlow=4116, downFlow=1432, sumFlow=5548}