MapReduce编程小案例.9th—join算法
数据:
有订单数据:
order001,u001 order002,u001 order003,u005 order004,u002 order005,u003 order006,u004
有用户数据:
u001,senge,18,angelababy u002,laozhao,48,ruhua u003,xiaoxu,16,chunge u004,laoyang,28,zengge u005,nana,14,huangbo
需求:要求把它们uid相同的整合起来(订单数据和用户数据按用户id关联)
思路:
map端:
不管worker读到的是什么文件,我们的map方法中是可以通过context来区分的
对于order数据,map中切字段,封装为一个joinbean,打标记:order
对于user数据,map中切字段,封装为一个joinbean,打标记:user
然后,以uid作为key,以joinbean作为value返回
reduce端:
用迭代器迭代出一组相同uid的所有数据joinbean,然后判断
如果标记字段为order的,则加入一个ArrayList&lt;JoinBean&gt;中
如果标记字段为user的,则放入一个JoinBean对象中
然后,遍历arraylist,对里面的每一个JoinBean填充userBean中的user数据,然后输出这个joinBean即可
实现代码:
JoinBean实现类
package cn.edu360.mr.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Value bean for the reduce-side join. A single bean type carries a record
 * from EITHER the order table or the user table; {@link #getTableName()}
 * ("order" or "user") tells the reducer which source a bean came from.
 * Fields that do not apply to the source record are filled with the
 * placeholder "NULL" / -1 by the mapper.
 *
 * <p>Must keep a public no-arg constructor and symmetric
 * {@link #write(DataOutput)} / {@link #readFields(DataInput)} so Hadoop can
 * serialize it between map and reduce.
 */
public class JoinBean implements Writable {

    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    // Source-table marker: "order" or "user".
    private String tableName;

    /**
     * Populates every field in one call so the mapper can reuse a single
     * bean instance per task instead of allocating one per input line.
     * NOTE: all String fields must be non-null, because write() uses
     * writeUTF which rejects null.
     */
    public void set(String orderId, String userId, String userName, int userAge,
            String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getTableName() {
        return tableName;
    }

    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public int getUserAge() {
        return userAge;
    }

    public void setUserAge(int userAge) {
        this.userAge = userAge;
    }

    public String getUserFriend() {
        return userFriend;
    }

    public void setUserFriend(String userFriend) {
        this.userFriend = userFriend;
    }

    /**
     * Output line format: orderId,userId,userAge,userName,userFriend.
     * The tableName marker is intentionally omitted — it is internal
     * plumbing, not part of the joined result.
     */
    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge + ","
                + this.userName + "," + this.userFriend;
    }

    /** Deserializes fields in the exact order written by {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.userName = in.readUTF();
        this.userAge = in.readInt();
        this.userFriend = in.readUTF();
        this.tableName = in.readUTF();
    }

    /** Serializes fields in a fixed order; must stay in sync with readFields(). */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.userName);
        out.writeInt(this.userAge);
        out.writeUTF(this.userFriend);
        out.writeUTF(this.tableName);
    }
}
ReduceSideJoin实现类
package cn.edu360.mr.join;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/*
 * Reduce-side join of order records with user records on uid.
 *
 * This is the simplest (and most memory-hungry) implementation: the reducer
 * buffers every order of a uid group in memory. A more efficient version
 * would use Partitioner + compareTo + GroupingComparator so the user record
 * is guaranteed to arrive first and no buffering is needed.
 */
public class ReduceSideJoin {

    public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {

        // Name of the input file this map task is reading; decides how each
        // line is parsed ("order..." files vs. user files).
        String fileName = null;
        // Reused per record to avoid one allocation per input line; safe
        // because the framework serializes the value on each context.write.
        JoinBean bean = new JoinBean();
        Text k = new Text();

        /*
         * setup() runs once per task before the first map() call — the right
         * place to capture which file this split belongs to.
         */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            fileName = inputSplit.getPath().getName();
        }

        /*
         * Emits (uid, JoinBean) for every input line. Fields that the source
         * table does not supply are filled with the "NULL" / -1 placeholders;
         * the tableName marker ("order"/"user") tells the reducer which side
         * of the join the bean belongs to.
         */
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, JoinBean>.Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fileName.startsWith("order")) {
                // order line: orderId,uid
                bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
            } else {
                // user line: uid,name,age,friend
                bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]),
                        fields[3], "user");
            }
            k.set(bean.getUserId());
            context.write(k, bean);
        }
    }

    public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable> {

        /*
         * For one uid group: buffer all order beans, remember the single user
         * bean, then stamp the user's attributes onto each order and emit it.
         *
         * The framework reuses the bean instance handed out by the iterator,
         * so every order must be defensively copied before buffering. The
         * copy is done with the bean's own set() — no reflection needed
         * (the original used BeanUtils.copyProperties, which forced a
         * blanket try/catch that swallowed real write errors).
         */
        @Override
        protected void reduce(Text key, Iterable<JoinBean> beans,
                Reducer<Text, JoinBean, JoinBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            ArrayList<JoinBean> orderList = new ArrayList<JoinBean>();
            JoinBean userBean = null;
            for (JoinBean bean : beans) {
                if ("order".equals(bean.getTableName())) {
                    JoinBean copy = new JoinBean();
                    copy.set(bean.getOrderId(), bean.getUserId(), bean.getUserName(),
                            bean.getUserAge(), bean.getUserFriend(), bean.getTableName());
                    orderList.add(copy);
                } else {
                    userBean = new JoinBean();
                    userBean.set(bean.getOrderId(), bean.getUserId(), bean.getUserName(),
                            bean.getUserAge(), bean.getUserFriend(), bean.getTableName());
                }
            }
            // Orders with no matching user record cannot be joined. The
            // original code hit a NullPointerException here and silently
            // swallowed it via printStackTrace — same net output (nothing
            // emitted for the group), now explicit.
            if (userBean == null) {
                return;
            }
            for (JoinBean order : orderList) {
                order.setUserName(userBean.getUserName());
                order.setUserAge(userBean.getUserAge());
                order.setUserFriend(userBean.getUserFriend());
                context.write(order, NullWritable.get());
            }
        }
    }

    /**
     * Job driver. Optional args: [0] input dir, [1] output dir; defaults to
     * the original hard-coded local paths when not supplied.
     */
    public static void main(String[] args) throws Exception {
        String inputDir = args.length > 0 ? args[0] : "F:\\mrdata\\join\\input";
        String outputDir = args.length > 1 ? args[1] : "F:\\mrdata\\join\\out1";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ReduceSideJoin.class);
        job.setMapperClass(ReduceSideJoinMapper.class);
        job.setReducerClass(ReduceSideJoinReducer.class);
        job.setNumReduceTasks(2);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        // Propagate job success/failure to the process exit code instead of
        // ignoring the boolean result.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
PS:以上代码是最low的,耗费内存太大了;