Key MapReduce Concepts Explained Through a Worked Example --------- Multi-Table Join

Copyright notice: original content; please credit the source when reposting! https://blog.csdn.net/Z_Date/article/details/83926827

Contents of the first table:

login:
uid	sexid	logindate
1	1	2017-04-17 08:16:20
2	2	2017-04-15 06:18:20
3	1	2017-04-16 05:16:24
4	2	2017-04-14 03:18:20
5	1	2017-04-13 02:16:25
6	2	2017-04-13 01:15:20
7	1	2017-04-12 08:16:34
8	2	2017-04-11 09:16:20
9	0	2017-04-10 05:16:50

Contents of the second table:

sex:
0	不知道
1	男
2	女

Contents of the third table:

user:
uid	uname
1	小红
2	小行
3	小通
4	小闪
5	小镇
6	小振
7	小秀
8	小微
9	小懂
10	小明
11	小刚
12	小举
13	小黑
14	小白
15	小鹏
16	小习

Expected final output (one line per login record, in the column order produced by User.toString()):

loginuid	uname	sex	logindate
1	小红	男	2017-04-17 08:16:20
2	小行	女	2017-04-15 06:18:20
3	小通	男	2017-04-16 05:16:24
4	小闪	女	2017-04-14 03:18:20
5	小镇	男	2017-04-13 02:16:25
6	小振	女	2017-04-13 01:15:20
7	小秀	男	2017-04-12 08:16:34
8	小微	女	2017-04-11 09:16:20
9	小懂	不知道	2017-04-10 05:16:50


Approach:

Map-side join

Core idea: place the small-table files in the distributed cache, then perform the join on the map side.

Applicable scenario: one or more small tables joined against one or more large table files.

Advantages: the small tables are held in memory on the map side, so lookups are fast; the volume of data transferred from the map side to the reduce side shrinks dramatically; time spent in the shuffle drops accordingly.

Disadvantage: it only applies when the workload actually involves a small table.

Semi join

This addresses the limitation above: when all of the input files are large, but the useful records extracted from one of them (for example, just the distinct join keys) would form a small file, that extract can be produced in a separate step, placed in the distributed cache, and then joined with an ordinary map-side join.
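To make the extraction step concrete, here is a minimal sketch of a preparatory job that pulls the distinct uid values out of the login table. The class name ExtractLoginUids and the argument layout are illustrative assumptions, not part of the original post; its small output file would then be cached and joined exactly like the small tables in the map-side join code below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Semi-join preparation step: extract the distinct uid values from the login table. */
public class ExtractLoginUids {

	static class UidMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// the uid is the first tab-separated field of each login record
			String uid = value.toString().split("\t")[0];
			context.write(new Text(uid), NullWritable.get());
		}
	}

	static class UidReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
		@Override
		protected void reduce(Text key, Iterable<NullWritable> values, Context context)
				throws IOException, InterruptedException {
			// grouping by key removes duplicates, leaving one line per distinct uid
			context.write(key, NullWritable.get());
		}
	}

	public static void main(String[] args) throws Exception {
		Job job = Job.getInstance(new Configuration(), "extract-login-uids");
		job.setJarByClass(ExtractLoginUids.class);
		job.setMapperClass(UidMapper.class);
		job.setReducerClass(UidReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));   // login table directory
		FileOutputFormat.setOutputPath(job, new Path(args[1])); // small distinct-uid output
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}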

A custom Writable class: User

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * User info bean
 * @author lyd
 *
 */
public class User implements Writable{

	public String uid;
	public String uname;
	public String gender;
	public String ldt;
	
	// no-arg constructor is required by Hadoop when deserializing Writable instances
	public User(){
		
	}
	
	public User(String uid, String uname, String gender, String ldt) {
		this.uid = uid;
		this.uname = uname;
		this.gender = gender;
		this.ldt = ldt;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(uid);
		out.writeUTF(uname);
		out.writeUTF(gender);
		out.writeUTF(ldt);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.uid = in.readUTF();
		this.uname = in.readUTF();
		this.gender = in.readUTF();
		this.ldt = in.readUTF();
	}

	/**
	 * @return the uid
	 */
	public String getUid() {
		return uid;
	}

	/**
	 * @param uid the uid to set
	 */
	public void setUid(String uid) {
		this.uid = uid;
	}

	/**
	 * @return the uname
	 */
	public String getUname() {
		return uname;
	}

	/**
	 * @param uname the uname to set
	 */
	public void setUname(String uname) {
		this.uname = uname;
	}

	/**
	 * @return the gender
	 */
	public String getGender() {
		return gender;
	}

	/**
	 * @param gender the gender to set
	 */
	public void setGender(String gender) {
		this.gender = gender;
	}

	/**
	 * @return the ldt
	 */
	public String getLdt() {
		return ldt;
	}

	/**
	 * @param ldt the ldt to set
	 */
	public void setLdt(String ldt) {
		this.ldt = ldt;
	}

	/* (non-Javadoc)
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		return uid + "\t" + uname + "\t" + gender + "\t" + ldt;
	}
}
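Note that User only implements Writable, which is sufficient for the map-only join used here. If the joined records ever had to pass through a shuffle (that is, if the job ran a reduce phase with User as the map output key), the key class would also need to implement WritableComparable so the framework can sort it. A minimal sketch of such a variant, reduced to the uid field for brevity (the class name ComparableUser is hypothetical):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/** Sketch: a key class the shuffle can sort, keyed on uid. */
public class ComparableUser implements WritableComparable<ComparableUser> {

	public String uid;

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(uid);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.uid = in.readUTF();
	}

	@Override
	public int compareTo(ComparableUser other) {
		// any consistent total ordering works; here: by user id
		return this.uid.compareTo(other.uid);
	}
}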

The MapReduce driver class: MultipleTableJoin

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MultipleTableJoin implements Tool{

	// Configuration handed in by ToolRunner; stored so run() sees the settings applied in setConf()
	private Configuration conf;

	/**
	 * Custom Mapper: performs the join entirely on the map side
	 * @author lyd
	 *
	 */
	static class MyMapper extends Mapper<LongWritable, Text, User, NullWritable>{

		Map<String,String> sexMap = new ConcurrentHashMap<String, String>();
		Map<String,String> userMap = new ConcurrentHashMap<String, String>();
		
		// load the cached small tables into in-memory lookup maps
		@Override
		protected void setup(Context context)throws IOException, InterruptedException {
			Path [] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
			for (Path p : paths) {
				String fileName = p.getName();
				if(fileName.equals("sex")){ // read the "sex" table
					BufferedReader sb = new BufferedReader(new FileReader(new File(p.toString())));
					String str = null;
					while((str = sb.readLine()) != null){
						String []  strs = str.split("\t");
						sexMap.put(strs[0], strs[1]);
					}
					sb.close();
				} else if(fileName.equals("user")){ // read the "user" table
					BufferedReader sb = new BufferedReader(new FileReader(new File(p.toString())));
					String str = null;
					while((str = sb.readLine()) != null){
						String []  strs = str.split("\t");
						userMap.put(strs[0], strs[1]);
					}
					sb.close();
				}
			}
		}

		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			
			String line = value.toString();
			String lines [] = line.split("\t");
			String uid = lines[0];
			String sexid = lines[1];
			String logindate = lines[2];
			
			// join: emit a record only when both lookup tables contain the key
			if(sexMap.containsKey(sexid) && userMap.containsKey(uid)){
				String uname = userMap.get(uid);
				String gender = sexMap.get(sexid);
				//User user = new User(uid, uname, gender, logindate);
				//context.write(new Text(uid+"\t"+uname+"\t"+gender+"\t"+logindate), NullWritable.get());
				User user = new User(uid, uname, gender, logindate);
				context.write(user, NullWritable.get());
			}	
		}

		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
		}
	}
	
	/**
	 * Custom Reducer (unused: this map-side join needs no reduce phase)
	 * @author lyd
	 *
	 */
	/*static class MyReducer extends Reducer<Text, Text, Text, Text>{

		@Override
		protected void setup(Context context)throws IOException, InterruptedException {
		}
		
		@Override
		protected void reduce(Text key, Iterable<Text> value,Context context)
				throws IOException, InterruptedException {
		}
		
		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
		}
	}*/
	
	
	@Override
	public void setConf(Configuration conf) {
		// point the job at the cluster's default file system, then keep this configuration
		// so that getConf() returns the same (configured) instance instead of a fresh one
		conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
		this.conf = conf;
	}

	@Override
	public Configuration getConf() {
		return (conf != null) ? conf : new Configuration();
	}
	
	/**
	 * Driver method: wires up and submits the job
	 */
	@Override
	public int run(String[] args) throws Exception {
		// 1. get the Configuration
		Configuration conf = getConf();
		// 2. create the job
		Job job = Job.getInstance(conf, "model01");
		// 3. set the class that carries the job's classes in the jar
		job.setJarByClass(MultipleTableJoin.class);
		// 4. configure the map side
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(User.class);
		job.setMapOutputValueClass(NullWritable.class);
		// map-only job: User implements Writable but not WritableComparable,
		// so skip the shuffle/sort by running zero reduce tasks
		job.setNumReduceTasks(0);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		
		// register the small tables (sex and user) in the distributed cache
		job.addCacheFile(new URI(args[2]));
		job.addCacheFile(new URI(args[3]));
		
//		URI [] uris = {new URI(args[2]),new URI(args[3])};
//		job.setCacheFiles(uris);
		
	/*	DistributedCache.addCacheFile(new URI(args[2]), conf);
		DistributedCache.addCacheFile(new URI(args[3]), conf);*/
		
	/*	// 5. configure the reduce side (not needed for this map-side join)
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);*/
		// if the output directory already exists, delete it
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path(args[1]))){
			fs.delete(new Path(args[1]), true);
		}
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// 6. submit the job and wait for it to finish
		int isok = job.waitForCompletion(true) ? 0 : 1;
		return isok;
	}
	
	/**
	 * Main entry point for the job
	 * @param args
	 */
	public static void main(String[] args) {
		try {
			// parse generic Hadoop options out of the argument list
			String [] argss = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
			System.exit(ToolRunner.run(new MultipleTableJoin(), argss));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
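For reference, the driver expects four arguments: args[0] is the input directory holding the login table, args[1] is the output directory, and args[2] and args[3] are the URIs of the cached sex and user files (their file names must be exactly "sex" and "user" so that the checks in setup() match). A hypothetical invocation with placeholder paths might look like:

hadoop jar multi-table-join.jar MultipleTableJoin /data/login /data/join_out /cache/sex /cache/user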
