1. Reduce-side join implementation
1.txt (order data):
1001,20150710,p0001,2
1002,20150710,p0002,3
1002,20150710,p0003,3
2.txt (product data):
p0001,小米5,1000,2000
p0002,锤子T1,1000,3000
The join key is used as the map output key, so the records from both tables that satisfy the join condition, each tagged with the file it came from, are routed to the same reduce task, where the actual join is performed.
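For the sample data above, for instance, both records for product p0001 are emitted with the key p0001 and therefore meet in the same reduce call:

from 1.txt: key = p0001, value = order record (id=1001, date=20150710, amount=2)
from 2.txt: key = p0001, value = product record (pname=小米5, ...)

Order p0003 has no matching line in 2.txt, so its reduce call sees only the order side.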
Step 1: Define the JoinBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @author kismet
* @date 2019-11-18 17:10
*/
public class JoinBean implements Writable {
    // fields from the order file
    private String id;
    private String date;
    private String pid;
    private String amount;
    // fields from the product file
    private String pname;
    private String category;
    private String price;

    @Override
    public String toString() {
        return "JoinBean{" +
                "id='" + id + '\'' +
                ", date='" + date + '\'' +
                ", pid='" + pid + '\'' +
                ", amount='" + amount + '\'' +
                ", pname='" + pname + '\'' +
                ", category='" + category + '\'' +
                ", price='" + price + '\'' +
                '}';
    }
    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    public String getDate() {
        return date;
    }
    public void setDate(String date) {
        this.date = date;
    }
    public String getPid() {
        return pid;
    }
    public void setPid(String pid) {
        this.pid = pid;
    }
    public String getAmount() {
        return amount;
    }
    public void setAmount(String amount) {
        this.amount = amount;
    }
    public String getPname() {
        return pname;
    }
    public void setPname(String pname) {
        this.pname = pname;
    }
    public String getCategory() {
        return category;
    }
    public void setCategory(String category) {
        this.category = category;
    }
    public String getPrice() {
        return price;
    }
    public void setPrice(String price) {
        this.price = price;
    }
    public JoinBean() {
    }
    public JoinBean(String id, String date, String pid, String amount, String pname, String category, String price) {
        this.id = id;
        this.date = date;
        this.pid = pid;
        this.amount = amount;
        this.pname = pname;
        this.category = category;
        this.price = price;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        // Appending "" turns null fields into the literal string "null",
        // because writeUTF cannot serialize a Java null.
        out.writeUTF(id + "");
        out.writeUTF(date + "");
        out.writeUTF(pid + "");
        out.writeUTF(amount + "");
        out.writeUTF(pname + "");
        out.writeUTF(category + "");
        out.writeUTF(price + "");
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        // Read the fields back in exactly the order they were written.
        this.id = in.readUTF();
        this.date = in.readUTF();
        this.pid = in.readUTF();
        this.amount = in.readUTF();
        this.pname = in.readUTF();
        this.category = in.readUTF();
        this.price = in.readUTF();
    }
}
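A quick way to sanity-check a Writable is a serialization round-trip. This is a minimal sketch, not part of the original job; the class name JoinBeanRoundTrip is made up for illustration:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class JoinBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        // Only the order-side fields are set, as the Mapper below does.
        JoinBean before = new JoinBean("1001", "20150710", "p0001", "2", null, null, null);
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        before.write(new DataOutputStream(buf));
        JoinBean after = new JoinBean();
        after.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        // Null fields come back as the literal string "null" -- this is
        // exactly what the Reducer's equals("null") check relies on.
        System.out.println(after);
    }
}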
Step 2: Define the Mapper class
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
* @author kismet
*/
public class WordCountMap extends Mapper<LongWritable, Text, Text, JoinBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        JoinBean jb = new JoinBean();
        // Work out which input file this split comes from.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();
        String[] split = value.toString().split(",");
        // Records from the order file: its file name must contain "orders",
        // so the sample file 1.txt above needs to be named accordingly
        // (e.g. orders.txt).
        if (name.contains("orders")) {
            jb.setId(split[0]);
            jb.setDate(split[1]);
            jb.setPid(split[2]);
            jb.setAmount(split[3]);
            // The product id is the join key.
            context.write(new Text(split[2]), jb);
        } else {
            // Records from the product file; the product id is in column 0.
            jb.setPname(split[1]);
            jb.setPrice(split[2]);
            jb.setCategory(split[3]);
            context.write(new Text(split[0]), jb);
        }
    }
}
Step 3: Define the custom Reducer class
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @author kismet
*/
public class WordCountReduce extends Reducer<Text, JoinBean, Text, JoinBean> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {
        JoinBean joinBean = new JoinBean();
        for (JoinBean value : values) {
            // Order records carry a real id; product records were written with
            // id == null, which write() serialized as the string "null".
            if (null != value.getId() && !value.getId().equals("null")) {
                joinBean.setId(value.getId());
                joinBean.setAmount(value.getAmount());
                joinBean.setDate(value.getDate());
                joinBean.setPid(value.getPid());
            } else {
                joinBean.setPname(value.getPname());
                joinBean.setPrice(value.getPrice());
                joinBean.setCategory(value.getCategory());
            }
        }
        context.write(new Text(""), joinBean);
    }
}
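Note that this reducer emits exactly one record per key, so if several orders share the same product id, each loop iteration overwrites the order fields and only the last order survives. A common fix is to buffer the two sides separately and emit one joined record per order. A minimal sketch, assuming at most one product record per key (the class name JoinReduce is made up for illustration):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class JoinReduce extends Reducer<Text, JoinBean, Text, JoinBean> {
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context)
            throws IOException, InterruptedException {
        JoinBean product = null;
        List<JoinBean> orders = new ArrayList<JoinBean>();
        for (JoinBean value : values) {
            // Hadoop reuses the value object between iterations,
            // so copy the fields out instead of storing the reference.
            if (null != value.getId() && !"null".equals(value.getId())) {
                JoinBean order = new JoinBean();
                order.setId(value.getId());
                order.setDate(value.getDate());
                order.setPid(value.getPid());
                order.setAmount(value.getAmount());
                orders.add(order);
            } else {
                product = new JoinBean();
                product.setPname(value.getPname());
                product.setPrice(value.getPrice());
                product.setCategory(value.getCategory());
            }
        }
        // Emit one joined record per order; orders without a product
        // record are still emitted, with the product fields left null.
        for (JoinBean order : orders) {
            if (product != null) {
                order.setPname(product.getPname());
                order.setPrice(product.getPrice());
                order.setCategory(product.getCategory());
            }
            context.write(new Text(order.getPid()), order);
        }
    }
}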
Step 4: Develop the main method entry point
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @author kismet
*/
public class WordCountDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration prepared by ToolRunner instead of a fresh one,
        // so that -D options from the command line are honoured.
        Job job = Job.getInstance(getConf(), "www");
        job.setJarByClass(WordCountDriver.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input"));
        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(JoinBean.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // The output directory must not exist before the job runs.
        TextOutputFormat.setOutputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input\\aa"));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new WordCountDriver(), args));
    }
}
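Since the driver already goes through ToolRunner, the hard-coded Windows paths can also be taken from the command line. A small sketch, assuming args[0] and args[1] are the input and output directories:

// Inside run(), replacing the two hard-coded paths:
TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));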
2. Map-side join implementation
This approach suits joins where one of the tables is small. The small table is distributed to every map node, so each mapper can join the big-table records it reads against the small table locally and emit the final result directly. Skipping the shuffle and reduce phase greatly increases the parallelism of the join and speeds up processing.
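The code below uses the old DistributedCache class, which is deprecated in Hadoop 2.x. The same effect is available directly on Job and the Mapper context; a minimal sketch of the newer API, assuming the same /aaa/pdts.txt file:

// In the driver, before submitting the job:
job.addCacheFile(new URI("/aaa/pdts.txt"));

// In the Mapper's setup(), instead of DistributedCache.getCacheFiles():
URI[] cacheFiles = context.getCacheFiles();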
Step 1: Define the map-join Mapper
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
/**
* @author kismet
*/
public class WordCountMap extends Mapper<LongWritable, Text, Text, Text> {
    // In-memory copy of the small (product) table: pid -> remaining columns.
    HashMap<String, String> b_tab = new HashMap<String, String>();
    String line = null;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Load the cached small-table file into the HashMap once per map task.
        URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        while ((line = bufferedReader.readLine()) != null) {
            String[] split = line.split(",");
            b_tab.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
        }
        // Close the reader before the FileSystem handle it reads from.
        IOUtils.closeStream(bufferedReader);
        fileSystem.close();
    }
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(",");
        // Look up the product columns for this order's pid.
        String s = b_tab.get(split[2]);
        if (s != null) {
            // Inner-join semantics: skip orders whose pid has no product
            // record (avoids a NullPointerException on unmatched keys).
            context.write(new Text(s), new Text(split[0] + "\t" + split[1] + "\t" + split[3]));
        }
    }
}
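For example, assuming /aaa/pdts.txt has the same layout as 2.txt above, the order line 1001,20150710,p0001,2 joins against the cached line p0001,小米5,1000,2000, and the mapper emits:

key   = "小米5\t1000\t2000"
value = "1001\t20150710\t2"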
Step 2: Define the driver with the main method
import com.czxy.demo6.WordCountMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
/**
* @author kismet
* @date 2019-11-18 18:08
*/
public class MapjoinDrive extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration prepared by ToolRunner so -D options are honoured.
        Configuration configuration = getConf();
        // Register the small-table file so it is shipped to every map task.
        DistributedCache.addCacheFile(new URI("/aaa/pdts.txt"), configuration);
        Job job = Job.getInstance(configuration, "www");
        job.setJarByClass(MapjoinDrive.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input"));
        job.setMapperClass(WordCountMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // The join happens entirely on the map side, so no reducers are needed.
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("E:\\第三学期\\第二阶段\\day22\\4\\map端join\\input\\bb"));
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MapjoinDrive(), args));
    }
}