测试数据
Order_0000001 Pdt_01 222.8
Order_0000002 Pdt_05 722.4
Order_0000001 Pdt_05 25.8
Order_0000003 Pdt_01 333.8
Order_0000003 Pdt_01 33.8
Order_0000002 Pdt_03 522.8
Order_0000002 Pdt_04 122.4
需求分析
- 需求:现在需要求出每一个订单中最贵的商品。
- a、mapper端把bean对象作为key输出,value写出null就行了;
- b、bean对象的compareTo方法,先按照orderid排序,再按照金额排倒序;
- c、自定义GroupingComparator,reducer中分组策略,orderid相同就分为一组;
- d、bean对象中orderid相同调用一次reducer方法;
- e、同一个orderid的第一个bean对象(key)就是金额最大的对象;
- f、reducer默认机制就是把同一组中第一个object作为key;
- g、取出来就是key(bean对象)就是同一个订单号中金额最大的一笔。
OrderSortBean
package com.hadoop.mapreduce.ordersort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class OrderSortBean implements WritableComparable<OrderSortBean> {

    // Hadoop Writable types (rather than String/double) are kept so callers can
    // reuse their compareTo implementations and the getters stay source-compatible.
    private Text orderId;
    private DoubleWritable price;

    /** No-arg constructor is required: Hadoop instantiates key objects reflectively. */
    public OrderSortBean() {
    }

    /**
     * Populates both fields from raw parsed values.
     *
     * @param orderId the order identifier, e.g. "Order_0000001"
     * @param price   the item price
     */
    public void setData(String orderId, double price) {
        this.orderId = new Text(orderId);
        this.price = new DoubleWritable(price);
    }

    /** Serializes the bean for shuffle transport. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId.toString());
        out.writeDouble(price.get());
    }

    /** Deserializes the bean; field order must mirror {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = new Text(in.readUTF());
        this.price = new DoubleWritable(in.readDouble());
    }

    /**
     * Sort order used by the shuffle: ascending by orderId, then descending by
     * price, so the first record of each orderId group carries the highest price.
     */
    @Override
    public int compareTo(OrderSortBean bean) {
        // 1. Ascending by order id.
        int result = this.orderId.compareTo(bean.getOrderId());
        if (result == 0) {
            // 2. Descending by price: reverse the operand order instead of negating
            // the result — negating compareTo is unsafe if an implementation ever
            // returns Integer.MIN_VALUE (FindBugs RV_NEGATING_RESULT_OF_COMPARETO).
            result = bean.getPrice().compareTo(this.price);
        }
        return result;
    }

    /**
     * Partition on orderId only, so that when the job runs with more than one
     * reducer the default HashPartitioner still routes every record of an order
     * to the same reduce task. Without this override the inherited identity hash
     * would scatter an order's records across reducers.
     */
    @Override
    public int hashCode() {
        return orderId == null ? 0 : orderId.hashCode();
    }

    /**
     * Equality matches the grouping semantics: beans of the same order are equal.
     * NOTE: deliberately inconsistent with {@link #compareTo(OrderSortBean)}
     * (which also orders by price); acceptable here because keys are never stored
     * in sorted collections that require the two to agree.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof OrderSortBean)) {
            return false;
        }
        OrderSortBean other = (OrderSortBean) obj;
        return orderId == null ? other.orderId == null : orderId.equals(other.orderId);
    }

    /** Output format: "orderId&lt;TAB&gt;price", used directly by TextOutputFormat. */
    @Override
    public String toString() {
        return orderId.toString() + "\t" + price.get();
    }

    public Text getOrderId() {
        return orderId;
    }

    public void setOrderId(Text orderId) {
        this.orderId = orderId;
    }

    public DoubleWritable getPrice() {
        return price;
    }

    public void setPrice(DoubleWritable price) {
        this.price = price;
    }
}
OrderSortGroupingComparator
package com.hadoop.mapreduce.ordersort;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Reduce-side grouping comparator: treats every bean with the same orderId as
 * one logical key, so reduce() is invoked exactly once per order.
 */
public class OrderSortGroupingComparator extends WritableComparator {

    /** Registers the key class and asks the framework to create instances reflectively. */
    public OrderSortGroupingComparator() {
        super(OrderSortBean.class, true);
    }

    /**
     * Compares only the orderId; price is ignored on purpose so that all beans
     * of one order fall into a single reduce group.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderSortBean first = (OrderSortBean) a;
        OrderSortBean second = (OrderSortBean) b;
        return first.getOrderId().compareTo(second.getOrderId());
    }
}
OrderSortMapper
package com.hadoop.mapreduce.ordersort;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class OrderSortMapper extends Mapper<LongWritable, Text, OrderSortBean, NullWritable> {

    // Reused across map() calls; safe because context.write() serializes the
    // key immediately, so the framework never holds a reference to this object.
    private final OrderSortBean bean = new OrderSortBean();

    /**
     * Parses one input record "orderId productId price" and emits the bean as
     * the key with a NullWritable value (the bean itself carries all the data).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // skip blank lines
        }
        // Split on any whitespace run so both tab- and space-delimited input
        // parse identically (the sample data uses spaces; "\t" alone would
        // yield a single field and throw on fields[2]).
        String[] fields = line.split("\\s+");
        if (fields.length < 3) {
            return; // skip malformed records instead of crashing the task
        }
        bean.setData(fields[0], Double.parseDouble(fields[2]));
        context.write(bean, NullWritable.get());
    }
}
OrderSortReducer
package com.hadoop.mapreduce.ordersort;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class OrderSortReducer extends Reducer<OrderSortBean, NullWritable, OrderSortBean, NullWritable> {
/**
 * Emits exactly one record per order group: the group's first (and therefore
 * most expensive) bean.
 */
@Override
protected void reduce(OrderSortBean key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
// The map-side sort places the highest-priced bean of each orderId first,
// and the grouping comparator merges all beans of one orderId into a single
// group; the key handed to reduce() is the group's first record, i.e. the
// order's most expensive item — so it is written out without iterating values.
context.write(key, NullWritable.get());
}
}
OrderSortDriver
package com.hadoop.mapreduce.ordersort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Job driver: finds the most expensive item within every order.
 */
public class OrderSortDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Build the job from the configuration injected by ToolRunner.
        Configuration conf = this.getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(OrderSortDriver.class);

        // Wire up the mapper and reducer implementations.
        job.setMapperClass(OrderSortMapper.class);
        job.setReducerClass(OrderSortReducer.class);

        // Map output and final output share the same key/value classes.
        job.setMapOutputKeyClass(OrderSortBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderSortBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Reduce-side grouping: all beans of one orderId form a single group.
        job.setGroupingComparatorClass(OrderSortGroupingComparator.class);

        // Input and output locations come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes, then map success onto an exit code.
        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 2) {
            System.err.println("Usage: hadoop jar <jarname> <classname> <input path> <output path>");
            System.exit(-1);
        }
        int ret = ToolRunner.run(new OrderSortDriver(), args);
        System.exit(ret);
    }
}
数据结果
Order_0000001 222.8
Order_0000002 722.4
Order_0000003 333.8