Code repository:
https://gitee.com/tanghongping/hadoopMapReduce/tree/master/src/com/thp/bigdata/secondarySort
Order id | Product id | Amount |
---|---|---|
Order_0000001 | Pdt_01 | 222.8 |
Order_0000001 | Pdt_05 | 25.8 |
Order_0000002 | Pdt_03 | 522.8 |
Order_0000002 | Pdt_04 | 122.4 |
Order_0000002 | Pdt_05 | 722.4 |
Order_0000003 | Pdt_01 | 222.8 |
The task is to find, for each order, the transaction with the largest amount; the expected result for the sample data above is worked out below.
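Worked out by hand from the sample table above, the largest transaction in each order is:

Order id | Product id | Amount |
---|---|---|
Order_0000001 | Pdt_01 | 222.8 |
Order_0000002 | Pdt_05 | 722.4 |
Order_0000003 | Pdt_01 | 222.8 |

(The OrderBean defined below carries only the order id and the amount, so the actual job output will contain just those two fields per order.)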
Analysis:
Records with the same order id must go to the same reduce task; only then can the largest transaction within each order be determined.
So write a custom Partitioner that sends all records with the same order id to the same reduce task.
However, the records arriving at that reduce task merely share an order id; they are still distinct OrderBean keys (Order_0000002 alone contributes three different beans), so by default they would not be grouped into a single reduce() call. A GroupingComparator is needed to make beans with the same order id count as one key.
OrderBean:
package com.thp.bigdata.secondarySort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
* An order: order id plus transaction amount.
* @author 汤小萌
*
*/
public class OrderBean implements WritableComparable<OrderBean>{
private Text itemId; // order id
private DoubleWritable mount; // transaction amount
public OrderBean() {}
public OrderBean(Text itemId, DoubleWritable mount) {
set(itemId, mount);
}
public void set(Text itemId, DoubleWritable mount) {
this.itemId = itemId;
this.mount = mount;
}
public Text getItemId() {
return itemId;
}
public void setItemId(Text itemId) {
this.itemId = itemId;
}
public DoubleWritable getMount() {
return mount;
}
public void setMount(DoubleWritable mount) {
this.mount = mount;
}
@Override
public String toString() {
return itemId + "\t" + mount.get();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(itemId.toString());
out.writeDouble(mount.get());
}
@Override
public void readFields(DataInput in) throws IOException {
this.itemId = new Text(in.readUTF());
this.mount = new DoubleWritable(in.readDouble());
}
// [Note:]
// This method defines how the keys are sorted.
/**
 * Called when the map output spills from memory to disk, and again when the
 * spill files are merged - both steps sort records with this comparison.
 */
@Override
public int compareTo(OrderBean o) {
int cmp = this.itemId.compareTo(o.getItemId());
if(cmp == 0) {
// the leading minus sign reverses the comparison: amounts sort from largest to smallest
cmp = -this.mount.compareTo(o.mount);
}
return cmp;
}
}
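To see what this compareTo() ordering does before wiring up the full job, here is a small stand-alone sketch (not part of the repository; OrderBeanSortDemo is a made-up name for illustration) that sorts a few beans in memory the same way the shuffle phase would:
package com.thp.bigdata.secondarySort;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
public class OrderBeanSortDemo {
	public static void main(String[] args) {
		List<OrderBean> beans = new ArrayList<OrderBean>();
		beans.add(new OrderBean(new Text("Order_0000002"), new DoubleWritable(122.4)));
		beans.add(new OrderBean(new Text("Order_0000001"), new DoubleWritable(25.8)));
		beans.add(new OrderBean(new Text("Order_0000002"), new DoubleWritable(722.4)));
		beans.add(new OrderBean(new Text("Order_0000001"), new DoubleWritable(222.8)));
		// Collections.sort() uses OrderBean.compareTo():
		// beans group by order id, and amounts are descending within each order.
		Collections.sort(beans);
		for (OrderBean b : beans) {
			System.out.println(b);
		}
		// Prints:
		// Order_0000001	222.8
		// Order_0000001	25.8
		// Order_0000002	722.4
		// Order_0000002	122.4
	}
}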
ItemIdPartitioner:
package com.thp.bigdata.secondarySort;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
import com.thp.bigdata.secondarySort.OrderBean;
/**
* Custom Partitioner:
* routes OrderBeans with the same order id to the same partition for processing.
* @author 汤小萌
*
*/
public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {
/**
* OrderBeans with the same order id are sent to the same partition.
* The number of partitions matches the number of reduce tasks configured for the job;
* numPartitions is that configured value.
*/
@Override
public int getPartition(OrderBean bean, NullWritable value, int numPartitions) {
return (bean.getItemId().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
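The partitioner can be sanity-checked outside of MapReduce as well. The sketch below (a hypothetical PartitionDemo class, not in the repository) prints which of three partitions each sample order id would be routed to; the exact numbers depend on Text.hashCode(), but beans sharing an order id always land in the same partition:
package com.thp.bigdata.secondarySort;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
public class PartitionDemo {
	public static void main(String[] args) {
		ItemIdPartitioner partitioner = new ItemIdPartitioner();
		int numPartitions = 3; // same value as job.setNumReduceTasks(3) in the driver below
		String[] ids = {"Order_0000001", "Order_0000002", "Order_0000003"};
		for (String id : ids) {
			OrderBean bean = new OrderBean(new Text(id), new DoubleWritable(0));
			int p = partitioner.getPartition(bean, NullWritable.get(), numPartitions);
			// the amount plays no role here - only the order id's hash decides the partition
			System.out.println(id + " -> partition " + p);
		}
	}
}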
ItemIdGroupingComparator:
package com.thp.bigdata.secondarySort;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import com.thp.bigdata.secondarySort.OrderBean;
/**
* Used on the reduce side so that OrderBeans with the same order id are treated as the same key.
* @author 汤小萌
*
*/
public class ItemIdGroupingComparator extends WritableComparator {
// This constructor is required:
// it registers the key class and, via the second argument (true), tells the framework to create instances of it through reflection for comparison.
protected ItemIdGroupingComparator() {
super(OrderBean.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean aBean = (OrderBean) a;
OrderBean bBean = (OrderBean) b;
// beans with the same order id are treated as the same key
return aBean.getItemId().compareTo(bBean.getItemId());
}
}
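The effect of the grouping comparator can be shown with one more small sketch (a hypothetical GroupingComparatorDemo, placed in the same package because the constructor is protected): two beans of the same order with different amounts are distinct keys for sorting, yet equal for grouping:
package com.thp.bigdata.secondarySort;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
public class GroupingComparatorDemo {
	public static void main(String[] args) {
		OrderBean a = new OrderBean(new Text("Order_0000002"), new DoubleWritable(722.4));
		OrderBean b = new OrderBean(new Text("Order_0000002"), new DoubleWritable(122.4));
		ItemIdGroupingComparator grouping = new ItemIdGroupingComparator();
		// prints 0: same order id, so both beans fall into one reduce() call
		System.out.println(grouping.compare(a, b));
		// prints a negative number: for sorting, a (the larger amount) comes first,
		// so a is the key the reducer sees for this group
		System.out.println(a.compareTo(b));
	}
}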
The MapReduce job:
package com.thp.bigdata.secondarySort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SecondarySort {
/**
 * Sample input (one record per line):
 * Order_0000001,Pdt_01,222.8
 * Order_0000001,Pdt_05,25.8
 * Order_0000002,Pdt_05,325.8
 * Order_0000002,Pdt_03,522.8
 * Order_0000002,Pdt_04,122.4
 * Order_0000003,Pdt_01,222.8
 *
 * Because OrderBean defines compareTo(), the map output is already sorted during the
 * shuffle phase: first by order id, then by amount in descending order.
 * The custom Partitioner routes OrderBeans with the same order id to the same partition,
 * so each reduce task receives whole orders.
 * Those beans are still distinct keys, though. The GroupingComparator makes the reduce
 * side treat all beans sharing an order id as one key, so each order triggers a single
 * reduce() call whose key is the first bean of the group - and thanks to the descending
 * sort that is the transaction with the largest amount.
 */
static class SecondarySortMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
OrderBean bean = new OrderBean();
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
System.out.println(line);
String[] fields = line.split(",");
// System.out.println(fields[0] + " -- " + fields[2]);
bean.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));
// System.out.println(bean.getItemId());
context.write(bean, NullWritable.get());
}
}
static class SecondarySortReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
// The key handed to reduce() is the first bean of its group, i.e. the largest-amount
// transaction of this order, so writing it once per group produces the answer.
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SecondarySort.class);
job.setMapperClass(SecondarySortMapper.class);
job.setReducerClass(SecondarySortReducer.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("f:/order/input"));
FileOutputFormat.setOutputPath(job, new Path("f:/order/output"));
// treat OrderBeans with the same order id as one key on the reduce side
job.setGroupingComparatorClass(ItemIdGroupingComparator.class);
// route OrderBeans with the same order id to the same reduce task
job.setPartitionerClass(ItemIdPartitioner.class);
// three reduce tasks, hence three partitions and three output files
job.setNumReduceTasks(3);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
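If the sample data is placed under f:/order/input, the job writes three output files, part-r-00000 through part-r-00002 (one per reduce task). Each output line comes from OrderBean.toString(): an order id followed by the largest amount recorded for that order, one line per order. Which file a particular order ends up in depends on the partitioner's hash of its id.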