The idea:
Find the maximum on the map side first (a local maximum for each map task), emit it from the Mapper's cleanup() method, then compare the per-map local maxima on the reduce side to arrive at the final, global maximum.
How do we find the map-side maximum? As always, look at the data first!
Sale date	SS-card no.	Product code	Product name	Qty sold	Amount due	Amount received
2018-01-01 001616528 236701 强力VC银翘片 6.0 82.8 69.0
2018-01-01 0012697828 861464 复方利血平片(复方降压片) 4.0 10.0 9.4
2018-01-01 0010060654328 861458 复方利血平氨苯蝶啶片(北京降压0号) 1.0 10.3 9.2
2018-01-01 0011811728 861456 酒石酸美托洛尔片(倍他乐克) 1.0 7.0 6.3
2018-01-01 0013448228 861507 苯磺酸氨氯地平片(安内真) 1.0 9.5 8.5
The data is prepared as two files, file1 and file2, so that each file gets its own map task and the reducer's job of comparing several local maxima is easy to see.
The requirement is to find the maximum in this data set and output the product name together with the amount received; in other words, output the name and the amount received of the product with the largest amount. For the sample rows above, that would be 强力VC银翘片 with 69.0.
package com.hnxy.mr.max;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MaxWrod3 extends Configured implements Tool {
public static class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
private Text outkey = new Text();
private DoubleWritable outval = new DoubleWritable();
private Double maxval = 0D;
private String maxkey = "";
String[] star = null;
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
throws IOException, InterruptedException {
// sample record: 2018-01-01  001616528  236701  强力VC银翘片  6.0  82.8  69.0
// split on \t
star = value.toString().split("\t");
// a well-formed record has exactly 7 fields (check null first, then length)
if (null != star && star.length == 7) {
// if the running local max is smaller than field 6 (the amount received), update it
if (maxval < Double.parseDouble(star[6])) {
maxval = Double.parseDouble(star[6]);
// and remember field 3 (the product name)
maxkey = star[3];
}
}
}
@Override
protected void cleanup(Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
throws IOException, InterruptedException {
// why cleanup()? it runs exactly once per map task, after the last map() call
// (normally used to release resources), so it is the right place to emit the task-local maximum
outkey.set(maxkey);
outval.set(maxval);
context.write(outkey, outval);
}
}
public static class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
private Text outkey = new Text();
private DoubleWritable outval = new DoubleWritable();
private Double maxval = 0D;
private String maxkey = "";
@Override
protected void reduce(Text key, Iterable<DoubleWritable> values,
Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
throws IOException, InterruptedException {
// compare the local maxima from the map tasks; a key group can hold several
// values if two mappers reported the same product, so iterate over all of them
for (DoubleWritable value : values) {
if (maxval < value.get()) {
maxval = value.get();
maxkey = key.toString();
}
}
}
@Override
protected void cleanup(Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
throws IOException, InterruptedException {
// emit the global maximum
outkey.set(maxkey);
outval.set(maxval);
context.write(outkey, outval);
}
}
@Override
public int run(String[] args) throws Exception {
// get the configuration
Configuration conf = this.getConf();
// set up the job (pass conf so ToolRunner-supplied options take effect)
Job job = Job.getInstance(conf);
job.setJarByClass(MaxWrod3.class);
// mapper / reducer classes and key/value types
job.setMapperClass(MaxMapper.class);
job.setReducerClass(MaxReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// input and output paths
Path in = new Path(args[0]);
Path out = new Path(args[1]);
// HDFS handle
FileSystem fs = FileSystem.get(conf);
// bind the input and output directories
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
// delete the output directory automatically if it already exists
if (fs.exists(out)) {
fs.delete(out, true);
System.out.println(job.getJobName() + "'s output path was deleted");
}
// run the job
boolean con = job.waitForCompletion(true);
if (con) {
System.out.println("ok");
} else {
System.out.println("failed");
}
return con ? 0 : 1;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new MaxWrod3(), args));
}
}
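The job is launched through ToolRunner, so the input and output directories arrive as args[0] and args[1]; the second version below runs the same way. A typical invocation might look like this (the jar name and HDFS paths are only examples, not from this post):

hadoop jar mr-max.jar com.hnxy.mr.max.MaxWrod3 /data/pharmacy /out/max

For the five sample rows above, the output file would hold the single line 强力VC银翘片	69.0, since TextOutputFormat separates key and value with a tab.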
A second way to find the maximum:
Join the product name and the price into a single value with a special separator (for example 强力VC银翘片\00169.0), and have every mapper emit that value under one shared key in the map phase. All the local maxima then arrive in a single reduce() call, so the reduce phase no longer needs a cleanup().
package com.hnxy.mr.max;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MaxWrod4 extends Configured implements Tool {
// separator used to join product name and amount into one value
private static final String SPLIT_STP = "\001";
public static class MaxMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text outkey = new Text();
private Text outval = new Text();
private Double maxval = 0D;
private String maxkey = "";
String[] star = null;
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// sample record: 2018-01-01  001616528  236701  强力VC银翘片  6.0  82.8  69.0
// split on \t
star = value.toString().split("\t");
// a well-formed record has exactly 7 fields (check null first, then length)
if (null != star && star.length == 7) {
// if the running local max is smaller than field 6 (the amount received), update it
if (maxval < Double.parseDouble(star[6])) {
maxval = Double.parseDouble(star[6]);
// and remember field 3 (the product name)
maxkey = star[3];
}
}
}
@Override
protected void cleanup(Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
// cleanup() runs exactly once per map task, after the last map() call; emit the local maximum here
// under the shared key "max", so every local maximum lands in the same reduce group
outkey.set("max");
outval.set(maxkey + SPLIT_STP + maxval);
context.write(outkey, outval);
}
}
public static class MaxReducer extends Reducer<Text, Text, Text, DoubleWritable> {
private Text outkey = new Text();
private DoubleWritable outval = new DoubleWritable();
private Double maxval = 0D;
private String maxkey = "";
private String[] strs = null;
@Override
protected void reduce(Text key, Iterable<Text> values,
Reducer<Text, Text, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {
// iterate over all the local maxima (one name\001amount pair per map task)
for (Text t : values) {
strs = t.toString().split(SPLIT_STP);
// keep the largest one
if (maxval < Double.parseDouble(strs[1])) {
maxkey = strs[0];
maxval = Double.parseDouble(strs[1]);
}
}
outkey.set(maxkey);
outval.set(maxval);
context.write(outkey, outval);
}
}
@Override
public int run(String[] args) throws Exception {
// get the configuration
Configuration conf = this.getConf();
// set up the job (pass conf so ToolRunner-supplied options take effect)
Job job = Job.getInstance(conf);
job.setJarByClass(MaxWrod4.class);
// mapper / reducer classes and key/value types
job.setMapperClass(MaxMapper.class);
job.setReducerClass(MaxReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DoubleWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// input and output paths
Path in = new Path(args[0]);
Path out = new Path(args[1]);
// HDFS handle
FileSystem fs = FileSystem.get(conf);
// bind the input and output directories
FileInputFormat.addInputPath(job, in);
FileOutputFormat.setOutputPath(job, out);
// delete the output directory automatically if it already exists
if (fs.exists(out)) {
fs.delete(out, true);
System.out.println(job.getJobName() + "'s output path was deleted");
}
// run the job
boolean con = job.waitForCompletion(true);
if (con) {
System.out.println("ok");
} else {
System.out.println("failed");
}
return con ? 0 : 1;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new MaxWrod4(), args));
}
}
There is also a third way: have map() emit every record and let a Combiner do the local aggregation, which removes the map-side cleanup().
I won't build it out in full here, because emitting every record from map() produces a lot of spill, sort and merge work, so it is less efficient than the two versions above; still, a minimal sketch follows for reference.
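For reference only, here is what that combiner variant might look like, reusing the name\001amount value encoding from MaxWrod4 (the class name MaxCombiner and the wiring line are my illustration, not code from this post). The mapper would emit ("max", name + "\001" + amount) for every valid record instead of using cleanup(), and this class would sit in the same file as the existing ones, so the same imports apply:

// illustrative sketch, not from the original post
public static class MaxCombiner extends Reducer<Text, Text, Text, Text> {
private Text outval = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values,
Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
double maxval = 0D;
String maxkey = "";
// keep only the largest name\001amount pair among this combiner call's inputs
for (Text t : values) {
String[] strs = t.toString().split("\001");
if (maxval < Double.parseDouble(strs[1])) {
maxkey = strs[0];
maxval = Double.parseDouble(strs[1]);
}
}
// re-emit in the same format, so Hadoop may safely run the combiner zero or more times
outval.set(maxkey + "\001" + maxval);
context.write(key, outval);
}
}

A combiner's input and output types must both match the map output types (here Text, Text); it is wired in with job.setCombinerClass(MaxCombiner.class), and the reducer stays exactly as in MaxWrod4.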
That's all for today. Good night ( ̄▽ ̄)"