Hadoop MapReduce in Practice: Auxiliary Sort and Secondary Sort with GroupingComparator

Auxiliary sort and secondary sort example (GroupingComparator)

  • Requirement

    Given the following orders:

OrderId ProductId Amount
0000001 Pdt_01 222.8
0000001 Pdt_06 25.8
0000002 Pdt_03 522.8
0000002 Pdt_04 122.4
0000002 Pdt_05 722.4
0000003 Pdt_01 222.8
0000003 Pdt_02 33.8
  • We now need to find the most expensive product in each order.

  • Input data

    Expected output
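    Based on the sample input above, the expected output is one record per order with the order id and its highest amount (the leading zeros disappear because the id is parsed as an int):

1	222.8
2	722.4
3	222.8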

  • Analysis

    • Use "order id + amount" as the map output key, so that all order records read in the map phase are partitioned by order id, sorted by amount, and sent to reduce.
    • On the reduce side, use a GroupingComparator to group the key/value pairs that share the same order id; the first key of each group is then the maximum, as illustrated below.
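      For example, for order 0000002 the reducer receives the keys already sorted as 722.4, 522.8, 122.4; since the grouping comparator compares only the order id, all three fall into a single group and only the first key (722.4) is written out.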
  • Code implementation
  • Define the order information bean: OrderBean

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

@Getter
@Setter
@AllArgsConstructor
@NoArgsConstructor
public class OrderBean implements WritableComparable<OrderBean> {

    // Order id
    private int order_id;
    // Price
    private double price;

    @Override
    public String toString() {
        return order_id + "\t" + price;
    }

    @Override
    public int compareTo(OrderBean o) {
        int result;
        if (this.order_id > o.getOrder_id()) {
            result = 1;
        } else if (this.order_id < o.getOrder_id()) {
            result = -1;
        } else {
            // Same order: sort by price in descending order, so the most
            // expensive item of an order comes first
            result = this.price > o.getPrice() ? -1 : 1;
        }
        return result;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read the fields back in the same order they were written
        order_id = in.readInt();
        price = in.readDouble();
    }
}

Write the OrderMapper

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

    OrderBean k = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1 Read one line
        String line = value.toString();

        // 2 Split it into fields
        String[] fields = line.split("\t");

        // 3 Fill the bean (order id and amount)
        k.setOrder_id(Integer.parseInt(fields[0]));
        k.setPrice(Double.parseDouble(fields[2]));

        // 4 Write out
        context.write(k, NullWritable.get());
    }
}

Write the OrderPartitioner

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numReduceTasks) {
        // Partition by order id so that all records of one order go to the same reducer
        return (key.getOrder_id() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

Write the OrderGroupingComparator

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderGroupingComparator extends WritableComparator {

    protected OrderGroupingComparator() {
        super(OrderBean.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {

        OrderBean aBean = (OrderBean) a;
        OrderBean bBean = (OrderBean) b;

        // Compare only the order id, so that all records of one order
        // end up in the same reduce group regardless of price
        int result;
        if (aBean.getOrder_id() > bBean.getOrder_id()) {
            result = 1;
        } else if (aBean.getOrder_id() < bBean.getOrder_id()) {
            result = -1;
        } else {
            result = 0;
        }

        return result;
    }
}
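On the reduce side, Hadoop calls this comparator on consecutive keys of the sorted map output to decide where one reduce group ends and the next begins; it is registered via job.setGroupingComparatorClass in the driver below.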

Write the OrderReducer

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {

        // Because keys are sorted by price in descending order and grouped by
        // order id, the key seen here is the most expensive item of the order
        context.write(key, NullWritable.get());
    }
}

Write the OrderDriver

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OrderDriver {

    public static void main(String[] args) throws Exception {

        // 1 Get the configuration and job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 Set the jar load path
        job.setJarByClass(OrderDriver.class);

        // 3 Set the mapper and reducer classes
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);

        // 4 Set the map output key and value types
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5 Set the final output key and value types
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Set the reduce-side grouping comparator
        job.setGroupingComparatorClass(OrderGroupingComparator.class);

        // 8 Set the partitioner
        job.setPartitionerClass(OrderPartitioner.class);

        // 9 Set the number of reduce tasks
        job.setNumReduceTasks(3);

        // 10 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
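With the partitioner above and three reduce tasks, order id 1 maps to partition 1, id 2 to partition 2 and id 3 to partition 0, so each of the three part-r-* output files holds the result of the orders assigned to it.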

Handling small files (custom InputFormat)

  • Requirement

    Small files hurt efficiency in both HDFS and MapReduce, yet in practice we often have to process large numbers of them, so a solution is needed: merge the small files into a single SequenceFile, which stores one record per file, with the file path plus name as the key and the file content as the value.

  • Input data

    Expected format of the final file:

  • Analysis

    Small files can essentially be optimized in the following ways:

    • Merge small files or small batches of data into larger files at collection time, before uploading to HDFS
    • Before the business processing, run a MapReduce job on HDFS to merge the small files
    • During MapReduce processing, use CombineTextInputFormat to improve efficiency (see the sketch after this list)
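    For reference, the CombineTextInputFormat route from the last bullet only needs a couple of extra lines in an ordinary driver; the 4 MB split size below is an arbitrary example value, not taken from the original post:

// In the driver, after Job job = Job.getInstance(conf);
// (CombineTextInputFormat is org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat)
job.setInputFormatClass(CombineTextInputFormat.class);
// Pack many small files together into splits of at most ~4 MB each
CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);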
  • Approach

    This section solves the small-file input problem with a custom InputFormat.

    • Define a class that extends FileInputFormat
    • Implement a RecordReader that reads one complete file at a time and wraps it as a key/value pair
    • Use SequenceFileOutputFormat on the output side to write the merged file
  • Code

    • Custom InputFormat
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// A FileInputFormat that reads each file as a single record
public class WholeFileInputformat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Never split a file; each file becomes exactly one split
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {

        WholeRecordReader recordReader = new WholeRecordReader();
        recordReader.initialize(split, context);

        return recordReader;
    }
}

Custom RecordReader

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private Configuration configuration;
    private FileSplit split;

    private boolean processed = false;
    private BytesWritable value = new BytesWritable();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.split = (FileSplit) split;
        configuration = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {

        if (!processed) {
            // 1 Allocate a buffer the size of the whole file
            byte[] contents = new byte[(int) split.getLength()];

            FSDataInputStream fis = null;

            try {
                // 2 Get the file system for this path
                Path path = split.getPath();
                FileSystem fs = path.getFileSystem(configuration);

                // 3 Open the file
                fis = fs.open(path);

                // 4 Read the whole file into the buffer
                IOUtils.readFully(fis, contents, 0, contents.length);

                // 5 Set the file content as the current value
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(fis);
            }

            processed = true;

            return true;
        }

        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1 : 0;
    }

    @Override
    public void close() throws IOException {
    }
}
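
Write the SequenceFileMapper

The driver further below registers a SequenceFileMapper, which the original post does not include. The following is a minimal sketch of what it presumably looks like, assuming the key is the path of the file backing the current split and the value is the whole-file content produced by WholeRecordReader:

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Use the path (plus name) of the file backing this split as the output key
        FileSplit split = (FileSplit) context.getInputSplit();
        k.set(split.getPath().toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Emit the whole file content under its path
        context.write(k, value);
    }
}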

Write the SequenceFileReducer

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {

        context.write(key, values.iterator().next());
    }
}

Write the SequenceFileDriver

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Local test paths; these override any command-line arguments
        args = new String[] { "e:/input/inputinputformat", "e:/output1" };
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);

        // Set the custom InputFormat
        job.setInputFormatClass(WholeFileInputformat.class);
        // Set the OutputFormat to write a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);
    }
}
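
To check the result, the merged output can be read back with SequenceFile.Reader. This is a small sketch under the assumption that the job wrote its output to e:/output1 and that the usual part-r-00000 file name is used:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // One output file of the job above (file name assumed)
        Path path = new Path("e:/output1/part-r-00000");

        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is: file path (key) -> file content (value)
            while (reader.next(key, value)) {
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        }
    }
}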


Reposted from blog.csdn.net/qq_45092505/article/details/105511106