MapReduce (6) -- Implementing reduce-side join and map-side join in MapReduce

1. Reduce-side join implementation

1.txt (order data: id, date, pid, amount; the mapper below recognizes this file by a name containing "orders"):

1001,20150710,p0001,2
1002,20150710,p0002,3
1002,20150710,p0003,3

2.txt (product data: pid, pname, price, category):

p0001,小米5,1000,2000
p0002,锤子T1,1000,3000

The idea: use the join condition (the product id) as the map output key, so that records from both tables that satisfy the join condition, each tagged with which source file it came from, are sent to the same reduce task, where the data is stitched together in the reduce phase.

Step 1: define the JoinBean

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @author kismet
 * @date 2019-11-18 17:10
 */
public class JoinBean implements Writable {
    private String id;
    private String date;
    private String pid;
    private String amount;
    private String pname;
    private String category;
    private String price;

    @Override
    public String toString() {
        return "JoinBean{" +
                "id='" + id + '\'' +
                ", date='" + date + '\'' +
                ", pid='" + pid + '\'' +
                ", amount='" + amount + '\'' +
                ", pname='" + pname + '\'' +
                ", category='" + category + '\'' +
                ", price='" + price + '\'' +
                '}';
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public String getAmount() {
        return amount;
    }

    public void setAmount(String amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public JoinBean() {
    }

    public JoinBean(String id, String date, String pid, String amount, String pname, String category, String price) {
        this.id = id;
        this.date = date;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.category = category;
        this.price = price;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Appending "" turns a null field into the literal string "null";
        // the reducer relies on this to tell order records from product records.
        out.writeUTF(id+"");
        out.writeUTF(date+"");
        out.writeUTF(pid+"");
        out.writeUTF(amount+"");
        out.writeUTF(pname+"");
        out.writeUTF(category+"");
        out.writeUTF(price+"");
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.id=in.readUTF();
        this.date=in.readUTF();
        this.pid=in.readUTF();
        this.amount=in.readUTF();
        this.pname=in.readUTF();
        this.category=in.readUTF();
        this.price=in.readUTF();
    }
}
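
A side note on the serialization above: because write() appends "" to every field, a null field crosses the shuffle as the literal string "null", and that is exactly what the reducer later tests for. The following standalone round-trip sketch (my own illustration, not part of the original post; the class name is made up) shows the effect:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/**
 * Standalone sketch: replays the Writable round trip that MapReduce performs
 * on JoinBean during the shuffle.
 */
public class JoinBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        // An order-side bean: the product fields are left null, as in the mapper below.
        JoinBean before = new JoinBean("1001", "20150710", "p0001", "2", null, null, null);

        // Serialize with write(), exactly as the framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));

        // Deserialize into a fresh bean with readFields().
        JoinBean after = new JoinBean();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // The null fields come back as the string "null" (because of the field + "" trick),
        // which is why the reducer checks value.getId().equals("null").
        System.out.println(after);
    }
}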

Step 2: define the Mapper class

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author kismet
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, JoinBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        JoinBean jb = new JoinBean();
        // Work out which input file this record comes from.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();
        String[] split = value.toString().split(",");
        if (name.contains("orders")) {
            // Order file (name contains "orders"): id,date,pid,amount
            jb.setId(split[0]);
            jb.setDate(split[1]);
            jb.setPid(split[2]);
            jb.setAmount(split[3]);
            // The join key is the product id.
            context.write(new Text(split[2]), jb);
        } else {
            // Product file: pid,pname,price,category
            jb.setPname(split[1]);
            jb.setPrice(split[2]);
            jb.setCategory(split[3]);
            context.write(new Text(split[0]), jb);
        }
    }
}

Step 3: define the Reducer class

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author kismet
 */
public class WordCountReduce extends Reducer<Text, JoinBean, Text, JoinBean> {

    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {
        JoinBean joinBean = new JoinBean();
        for (JoinBean value : values) {
            // Order-side beans carry a real id; product-side beans had a null id that was
            // serialized as the literal string "null", which is how the two sides are told apart.
            if (null != value.getId() && !value.getId().equals("null")) {
                joinBean.setId(value.getId());
                joinBean.setDate(value.getDate());
                joinBean.setPid(value.getPid());
                joinBean.setAmount(value.getAmount());
            } else {
                joinBean.setPname(value.getPname());
                joinBean.setPrice(value.getPrice());
                joinBean.setCategory(value.getCategory());
            }
        }
        // Note: if one product key has several orders, only the last order's fields survive;
        // that is acceptable for this sample data.
        // An empty key is written so the output line contains only the joined bean.
        context.write(new Text(""), joinBean);
    }
}

Step 4: develop the main-method entry point

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * @author kismet
 */
public class WordCountDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "www");

        job.setJarByClass(WordCountDriver.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input"));

        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);

        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(JoinBean.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input\\aa"));

        return job.waitForCompletion(true) ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new WordCountDriver(), args);
    }
}


2. Map-side join implementation

This approach applies when one of the joined tables is small.

The small table can be distributed to every map node, so each map task joins the big-table records it reads against the small table locally and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.

Step 1: define the map-join Mapper

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

/**
 * @author kismet
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, Text> {
    HashMap<String,String> b_tab = new HashMap<String, String>();
    String line = null;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Load the cached small table (the product file) into memory, keyed by pid.
        URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        while ((line = bufferedReader.readLine()) != null) {
            // Product file: pid,pname,price,category
            String[] split = line.split(",");
            b_tab.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
        }
        // Close the reader before the file system it was opened from.
        IOUtils.closeStream(bufferedReader);
        fileSystem.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Big-table (order) record: id,date,pid,amount -- look the product up by pid.
        String[] split = value.toString().split(",");
        String product = b_tab.get(split[2]);
        if (product == null) {
            // No matching product row (e.g. p0003 in the sample orders): skip the record.
            return;
        }
        context.write(new Text(product), new Text(split[0] + "\t" + split[1] + "\t" + split[3]));
    }
}
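
At its core, the map-side join is just an in-memory hash lookup: the small product table lives in a HashMap and every order line probes it by pid. The following plain-Java sketch (my own illustration on the sample data from section 1, no Hadoop involved, class name made up) shows the same logic in isolation:

import java.util.HashMap;

/**
 * Standalone illustration of the hash join performed in map():
 * the small table is held in memory, each big-table (order) line probes it by pid.
 */
public class HashJoinSketch {
    public static void main(String[] args) {
        HashMap<String, String> products = new HashMap<String, String>();
        products.put("p0001", "小米5\t1000\t2000");
        products.put("p0002", "锤子T1\t1000\t3000");

        String[] orders = {"1001,20150710,p0001,2", "1002,20150710,p0002,3", "1002,20150710,p0003,3"};
        for (String order : orders) {
            String[] f = order.split(",");
            String product = products.get(f[2]);   // probe the small table by pid
            if (product == null) {
                continue;                          // p0003 has no product row, so it is dropped
            }
            System.out.println(product + "\t" + f[0] + "\t" + f[1] + "\t" + f[3]);
        }
    }
}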

Step 2: define the driver main method


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

/**
 * @author kismet
 * @date 2019-11-18 18:08
 */
public class MapjoinDrive extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        DistributedCache.addCacheFile(new URI("/aaa/pdts.txt"),configuration);

        Job job = Job.getInstance(configuration, "www");
        job.setJarByClass(MapjoinDrive.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input"));

        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);


        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input\\bb"));

        return job.waitForCompletion(true) ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MapjoinDrive(), args);
    }
}
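
DistributedCache has been deprecated since Hadoop 2.x. As a closing note, here is a minimal sketch of the same driver written against the newer cache-file API; the class name, the use of command-line paths, and the explicit zero-reducer setting are my own additions, and inside the mapper's setup() the call DistributedCache.getCacheFiles(context.getConfiguration()) would be replaced by context.getCacheFiles():

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.net.URI;

/**
 * Sketch only: the map-join driver rewritten against the non-deprecated
 * cache-file API (Job#addCacheFile, Hadoop 2.x+). Class name and the
 * command-line paths are illustrative, not from the original post.
 */
public class MapJoinDriverV2 {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "mapjoin");
        job.setJarByClass(MapJoinDriverV2.class);

        // Ship the small product table to every map task.
        job.addCacheFile(new URI("/aaa/pdts.txt"));
        // A pure map-side join needs no reduce phase.
        job.setNumReduceTasks(0);

        job.setMapperClass(WordCountMap.class);   // the map-join mapper from step 1 (same package assumed)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));     // input directory
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(args[1]));   // output directory

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}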