MapReduce-join连接

join连接

MapReduce能够执行大型数据集间的连接（join）操作。连接操作的具体实现技术取决于数据集的规模及分区方式
连接操作如果由mapper执行，则称为“map端连接”；如果由reducer执行，则称为“reduce端连接”。

Map端连接

在两个大规模输入数据集之间的map端连接会在数据到达map函数之前就执行连接操作。为达到该目的，各map的输入数据必须先分区并且以特定方式排序。各个输入数据集被划分成相同数量的分区，并且均按相同的键（连接键）排序。同一键的所有记录均会放在同一分区之中。
Map端连接操作可以连接多个作业的输出，只要这些作业的reducer数量相同、键相同并且输出文件是不可切分的（例如，小于一个HDFS块，或gzip压缩）。

Reduce端连接

由于reduce端连接并不要求输入数据集符合特定结构，因而reduce端连接比map端连接更为常用。但是，由于两个数据集均需经过MapReduce的shuffle过程，所以reduce端连接的效率往往要低一些。基本思路是mapper为各个记录标记源，并且使用连接键作为map输出键，使键相同的记录放在同一reducer中。
需要使用以下技术

1.多输入

数据集的输入源往往有多种格式，因此可以使用MultipleInputs类来方便地解析和标注各个源。

2.辅助排序

reducer将从两个源中获得键相同的记录，但并不保证这些记录已按特定顺序排好。此外，为了更好地执行连接操作，先将某一个源的数据传输到reducer会非常重要，这需要借助辅助排序来实现。

举个例子

现有气象站文件及气象数据文件，需要将两个文件进行关联

气象站文件内容如下

00001,北京
00002,天津
00003,山东

气象数据文件内容如下

00001,20180101,15
00001,20180102,16
00002,20180101,25
00002,20180102,26
00003,20180101,35
00003,20180102,36

　要求：输出气象站ID 气象站名称及气象数据

代码如下

1.JoinRecordWithStationName类

package com.zhen.mapreduce.join;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;



/**
 * Reduce-side join that attaches the station name to every weather record.
 *
 * <p>Two mappers read the two inputs and emit {@link TextPair} keys of the form
 * (station ID, tag). The job partitions and groups on the station ID only, and
 * {@link TextPair#compareTo(TextPair)} orders keys by ID and then by tag, so the
 * station record (tag {@code "0"}) is sorted ahead of the weather records
 * (tag {@code "1"}) of the same station.
 *
 * @author FengZhen
 * @date 2018-09-16
 */
public class JoinRecordWithStationName extends Configured implements Tool{

	/** Key tag of station metadata; it sorts before {@link #RECORD_TAG}. */
	private static final String STATION_TAG = "0";
	/** Key tag of weather records. */
	private static final String RECORD_TAG = "1";
	/** Counter group used to report input lines and stations that could not be joined. */
	private static final String COUNTER_GROUP = "JoinRecordWithStationName";

	/**
	 * Mapper that tags station metadata records for the reduce-side join.
	 *
	 * <p>Input lines look like:
	 * <pre>
	 * 00001,北京
	 * 00002,天津
	 * 00003,山东
	 * </pre>
	 * Lines the parser rejects are skipped.
	 *
	 * @author FengZhen
	 */
	static class JoinStationMapper extends Mapper<LongWritable, Text, TextPair, Text>{
		private final NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			if (parser.parse(value.toString())) {
				context.write(new TextPair(parser.getStationId(), STATION_TAG), new Text(parser.getStationName()));
			}
		}
	}
	
	/**
	 * Mapper that tags weather records for the reduce-side join.
	 *
	 * <p>Input lines look like:
	 * <pre>
	 * 00001,20180101,15
	 * 00001,20180102,16
	 * 00002,20180101,25
	 * </pre>
	 * The whole input line is emitted as the value. Malformed lines are counted
	 * and skipped instead of being emitted under a wrong station ID.
	 *
	 * @author FengZhen
	 */
	static class JoinRecordMapper extends Mapper<LongWritable, Text, TextPair, Text> {
		private final NcdcRecordParser parser = new NcdcRecordParser();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// The parser instance is reused for every line. Clear the ID first so that
			// a line parse() rejects without assigning an ID cannot inherit the
			// station of the previous line.
			parser.setStationId(null);
			try {
				parser.parse(value.toString());
			} catch (NumberFormatException e) {
				// Non-numeric date or temperature: treat the line as malformed.
				parser.setStationId(null);
			}
			String stationId = parser.getStationId();
			if (stationId == null) {
				context.getCounter(COUNTER_GROUP, "MalformedWeatherRecords").increment(1);
				return;
			}
			context.write(new TextPair(stationId, RECORD_TAG), value);
		}
	}
	
	/**
	 * Reducer that joins each weather record with the name of its station.
	 *
	 * <p>All values of one station ID arrive in a single call, station record
	 * first. The station name is taken from that first value and prepended
	 * (tab-separated) to every following weather record; the output key is the
	 * station ID. Stations that have weather records but no station record are
	 * counted and produce no output.
	 *
	 * @author FengZhen
	 */
	static class JoinReducer extends Reducer<TextPair, Text, Text, Text> {
		@Override
		protected void reduce(TextPair key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// On entry the key is the one that belongs to the first value of the group.
			// If its tag is not the station tag, no station record exists for this ID and
			// the first value would be a weather record, not a station name.
			if (!STATION_TAG.equals(key.getSecond().toString())) {
				context.getCounter(COUNTER_GROUP, "StationsWithoutMetadata").increment(1);
				return;
			}
			Iterator<Text> iterator = values.iterator();
			// Copy the station name: the framework may reuse the object returned by the iterator.
			Text stationName = new Text(iterator.next());
			while (iterator.hasNext()) {
				Text record = iterator.next();
				Text outValue = new Text(stationName.toString() + "\t" + record.toString());
				context.write(key.getFirst(), outValue);
			}
		}
	}
	
	/**
	 * Partitions on the station ID only, so that the station record and all weather
	 * records of one station reach the same reducer regardless of their tag.
	 */
	static class KeyPartitioner extends Partitioner<TextPair, Text>{
		@Override
		public int getPartition(TextPair key, Text value, int numPartitions) {
			// Masking with Integer.MAX_VALUE clears the sign bit so the result is never negative.
			return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
		}
	}
	
	/**
	 * Configures and runs the join job.
	 *
	 * @param args {@code args[0]} weather-record input path, {@code args[1]} station
	 *             input path, {@code args[2]} output path; extra arguments are ignored
	 * @return 0 if the job succeeded, 1 if it failed, -1 if too few arguments were given
	 * @throws Exception if the job cannot be set up or submitted
	 */
	@Override
	public int run(String[] args) throws Exception {
		if (args.length < 3) {
			System.err.printf("Usage: %s [generic options] <ncdc input> <station input> <output>%n",
					getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.err);
			return -1;
		}

		Job job = Job.getInstance(getConf());
		job.setJobName("JoinRecordWithStationName");
		job.setJarByClass(JoinRecordWithStationName.class);

		Path ncdcInputPath = new Path(args[0]);
		Path stationInputPath = new Path(args[1]);
		Path outputPath = new Path(args[2]);
		
		// Each input gets its own mapper so that records can be tagged by source.
		MultipleInputs.addInputPath(job, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class);
		MultipleInputs.addInputPath(job, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
		FileOutputFormat.setOutputPath(job, outputPath);
		
		// Partition and group on the station ID only; the tag only influences the sort order.
		job.setPartitionerClass(KeyPartitioner.class);
		job.setGroupingComparatorClass(TextPair.FirstComparator.class);
		
		job.setMapOutputKeyClass(TextPair.class);
		
		job.setReducerClass(JoinReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Runs the job against fixed HDFS paths and exits with the job's status.
	 *
	 * <p>NOTE(review): the command-line arguments are not used; the input and
	 * output paths are hard-coded below.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)  {
		String[] params = new String[] {
				"hdfs://fz/user/hdfs/MapReduce/data/join/JoinRecordWithStationName/input/record",
				"hdfs://fz/user/hdfs/MapReduce/data/join/JoinRecordWithStationName/input/station",
				"hdfs://fz/user/hdfs/MapReduce/data/join/JoinRecordWithStationName/output"};
		// Start from a failure status so that an exception does not end in exit code 0.
		int exitCode = 1;
		try {
			exitCode = ToolRunner.run(new JoinRecordWithStationName(), params);
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.exit(exitCode);
	}
	
}

2.NcdcRecordParser类

package com.zhen.mapreduce.join;

import java.io.Serializable;

/**
 * Parses one comma-separated weather record of the form
 * {@code stationId,time,temperature}, for example {@code 00001,20180101,15}.
 *
 * <p>An instance can be reused for many lines: every call to
 * {@link #parse(String)} discards the result of the previous call, and
 * {@link #isValidTemperature()} tells whether the most recent line was accepted.
 *
 * @author FengZhen
 * @date 2018-09-09
 */
public class NcdcRecordParser implements Serializable{

	private static final long serialVersionUID = 1L;

	/**
	 * Weather station ID; {@code null} if the last parsed line was rejected.
	 */
	private String stationId;
	/**
	 * Second field of the record parsed as a long (e.g. 20180101); 0 if the last
	 * parsed line was rejected.
	 */
	private long timeStamp;
	/**
	 * Temperature; {@code null} if the last parsed line was rejected.
	 */
	private Integer temperature;
	
	/**
	 * Parses one line and stores its fields in this object.
	 *
	 * <p>The previous result is always cleared first. The fields are only set
	 * when the line has at least three comma-separated fields and the second and
	 * third are valid numbers; otherwise the object is left in its cleared state
	 * (ID and temperature {@code null}, time 0).
	 *
	 * @param value the input line
	 */
	public void parse(String value) {
		// Drop the previous record so that a rejected line is never reported
		// with the values of the line before it.
		stationId = null;
		timeStamp = 0L;
		temperature = null;

		String[] values = value.split(",");
		if (values.length < 3) {
			return;
		}
		try {
			// Convert both numbers before touching any field, so the record is
			// either stored completely or not at all.
			long parsedTime = Long.parseLong(values[1]);
			Integer parsedTemperature = Integer.valueOf(values[2]);
			stationId = values[0];
			timeStamp = parsedTime;
			temperature = parsedTemperature;
		} catch (NumberFormatException ignored) {
			// Non-numeric time or temperature: the line is malformed, stay cleared.
		}
	}
	
	/**
	 * Tells whether the last parsed line yielded a temperature.
	 *
	 * @return {@code true} if a temperature is set
	 */
	public boolean isValidTemperature() {
		return null != temperature;
	}

	public String getStationId() {
		return stationId;
	}

	public void setStationId(String stationId) {
		this.stationId = stationId;
	}

	public long getTimeStamp() {
		return timeStamp;
	}

	public void setTimeStamp(long timeStamp) {
		this.timeStamp = timeStamp;
	}

	public Integer getTemperature() {
		return temperature;
	}

	public void setTemperature(Integer temperature) {
		this.temperature = temperature;
	}
	
}

　3.NcdcStationMetadataParser类

package com.zhen.mapreduce.join;

import java.io.Serializable;

/**
 * Parses one comma-separated station metadata line of the form
 * {@code stationId,stationName}.
 *
 * <p>An instance can be reused for many lines: every call to
 * {@link #parse(String)} discards the result of the previous call.
 *
 * @author FengZhen
 * @date 2018-09-09
 */
public class NcdcStationMetadataParser implements Serializable{

	private static final long serialVersionUID = 1L;

	/**
	 * Weather station ID; {@code null} if the last parsed line was rejected.
	 */
	private String stationId;
	/**
	 * Weather station name; {@code null} if the last parsed line was rejected.
	 */
	private String stationName;
	
	/**
	 * Parses one line and stores its first two fields in this object.
	 *
	 * <p>The previous result is always cleared first, so after a rejected line
	 * the getters return {@code null} rather than the previous station.
	 *
	 * @param value the input line
	 * @return {@code true} if the line had at least two comma-separated fields
	 *         and was stored, {@code false} otherwise
	 */
	public boolean parse(String value) {
		stationId = null;
		stationName = null;

		String[] values = value.split(",");
		if (values.length < 2) {
			return false;
		}
		stationId = values[0];
		stationName = values[1];
		return true;
	}

	public String getStationId() {
		return stationId;
	}

	public void setStationId(String stationId) {
		this.stationId = stationId;
	}

	public String getStationName() {
		return stationName;
	}

	public void setStationName(String stationName) {
		this.stationName = stationName;
	}
}

　4.TextPair类

package com.zhen.mapreduce.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @author FengZhen
 * @date 2018年9月16日
 * 
 */
public class TextPair implements WritableComparable<TextPair>{

	private Text first;
	private Text second;
	public TextPair() {
		set(new Text(), new Text());
	}
	public TextPair(String first, String second) {
		set(new Text(first), new Text(second));
	}
	public TextPair(Text first, Text second) {
		set(first, second);
	}
	public void set(Text first, Text second) {
		this.first = first;
		this.second = second;
	}
	
	public void write(DataOutput out) throws IOException {
		first.write(out);
		second.write(out);
	}

	public void readFields(DataInput in) throws IOException {
		first.readFields(in);
		second.readFields(in);
	}

	@Override
	public int hashCode() {
		return first.hashCode() * 163 + second.hashCode();
	}
	
	@Override
	public boolean equals(Object obj) {
		if (obj instanceof TextPair) {
			TextPair textPair = (TextPair) obj;
			return first.equals(textPair.first) && second.equals(textPair.second);
		}
		return false;
	}
	
	public int compareTo(TextPair o) {
		int cmp = first.compareTo(o.first);
		if (cmp != 0) {
			return cmp;
		}
		return second.compareTo(o.second);
	}
	
	public Text getFirst() {
		return first;
	}
	public void setFirst(Text first) {
		this.first = first;
	}
	public Text getSecond() {
		return second;
	}
	public void setSecond(Text second) {
		this.second = second;
	}
	@Override
	public String toString() {
		return first + "\t" + second;
	}
	
	/**
	 * 比较两个int值大小
	 * 降序
	 * @param a
	 * @param b
	 * @return
	 */
	public static int compare(Text a, Text b) {
		return a.compareTo(b);
	}
	
	static class FirstComparator extends WritableComparator{
		protected FirstComparator() {
			super(TextPair.class, true);
		}
		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			TextPair ip1 = (TextPair) a;
			TextPair ip2 = (TextPair) b;
			return TextPair.compare(ip1.getFirst(), ip2.getFirst());
		}
	}
	
}

　打jar包，上传并执行

scp /Users/FengZhen/Desktop/Hadoop/file/JoinRecordWithStationName.jar user@host:/usr/local/test/mr
hadoop jar JoinRecordWithStationName.jar com.zhen.mapreduce.join.JoinRecordWithStationName

　结果如下

00001	北京	00001,20180102,16
00001	北京	00001,20180101,15
00002	天津	00002,20180102,26
00002	天津	00002,20180101,25
00003	山东	00003,20180102,36
00003	山东	00003,20180101,35