MapReduce: Single-Table Join and Multi-Table Join

Single-table join:
Given a child-parent table,
produce the grandchild-grandparent table.

Input:
Data in file p:
Tom,Lucy
Tom,Jack
Jone,Lucy
Jone,Jack
Lucy,Mary
Lucy,Ben
Jack,Alice
Jack,Jesse
Terry,Alice
Terry,Jesse
Philip,Terry
Philip,Alma
Mark,Terry
Mark,Alma

Output:
Tom,Alice
Tom,Jesse
Jone,Alice
Jone,Jesse
Tom,Mary
Tom,Ben
Jone,Mary
Jone,Ben
Philip,Alice
Philip,Jesse
Mark,Alice
Mark,Jesse
Approach:
1. Split each input line and write it out twice, once keyed by the child field and once keyed by the parent field, with a flag in the value recording which role the key plays (see the example below).
2. The rest of the method is the same reduce-side join used in the multi-table case below.
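As a quick illustration of step 1 (derived from the mapper logic below), the input line Tom,Lucy is emitted as two records:

Tom     2,Tom,Lucy    (keyed by the child; flag 2 means "the key is the child in this record")
Lucy    1,Tom,Lucy    (keyed by the parent; flag 1 means "the key is the parent in this record")

At the reducer, the key Lucy therefore collects her children from the flag-1 records (Tom, Jone) and her parents from the flag-2 records (Mary, Ben); the cross product of the two lists gives the grandchild-grandparent pairs Tom,Mary Tom,Ben Jone,Mary Jone,Ben.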
Code:
package One_File_Relation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class One_file_test {

    static String INPUT_PATH = "hdfs://master:9000/input/p";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            String[] tokens = value.toString().split(",");

            if (tokens != null && tokens.length == 2) {
                // Emit the record keyed by the child; flag 2 means "the key is the child".
                output_key.set(tokens[0].trim());
                output_value.set(2 + "," + value);
                context.write(output_key, output_value);

                // Emit the record again keyed by the parent; flag 1 means "the key is the parent".
                output_key.set(tokens[1].trim());
                output_value.set(1 + "," + value);
                context.write(output_key, output_value);

                System.out.println(tokens[0] + "  -  " + tokens[1]);
            }
        }
    }
    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Flag 1: the key is the parent, so tokens[1] (the first field of the original line) is a child of the key.
            // Flag 2: the key is the child, so tokens[2] (the second field of the original line) is a parent of the key.
            List<String> childs = new ArrayList<String>();
            List<String> grands = new ArrayList<String>();

            for (Text line : values) {
                String[] tokens = line.toString().split(",");
                if (tokens[0].equals("1")) {
                    childs.add(tokens[1]);
                    System.out.println(1 + "==" + tokens[1]);
                } else if (tokens[0].equals("2")) {
                    grands.add(tokens[2]);
                    System.out.println(2 + "==" + tokens[2]);
                }
            }

            // Cross product: every grandchild under this key paired with every grandparent.
            for (String c : childs) {
                for (String g : grands) {
                    output_key.set(c);
                    output_value.set(g);
                    context.write(output_key, output_value);
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000/");

        // Delete the output directory if it already exists, otherwise the job fails.
        Path outputpath = new Path(OUTPUT_PATH);
        FileSystem fs = outputpath.getFileSystem(conf);
        if (fs.exists(outputpath)) {
            fs.delete(outputpath, true);
        }

        Job job = Job.getInstance(conf);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
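To run the single-table job, the class can be packaged into a jar and submitted with hadoop jar; the reducer's output lands in the output directory as part-r-00000. A minimal sketch, assuming the jar is named relation.jar (the name is arbitrary):

hadoop jar relation.jar One_File_Relation.One_file_test
hadoop fs -text /output/part-r-00000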

Multi-table join:
Input:

File num1:
xm@master:~$ hadoop fs -text /b/num1
1,Beijing
2,Guangzhou
3,Shenzhen
4,Xian

File num2:
xm@master:~$ hadoop fs -text /b/num2
Beijing Red Star,1
Shenzhen Thunder,3
Guangzhou Honda,2
Beijing Rising,1
Guangzhou Development Bank,2
Tencent,3
Back of Beijing,1
Output:
Back of Beijing Beijing
Beijing Rising  Beijing
Beijing Red Star    Beijing
Guangzhou Development Bank  Guangzhou
Guangzhou Honda Guangzhou
Tencent Shenzhen
Shenzhen Thunder    Shenzhen
Approach:
1. In the mapper's setup method, get the name of the file the current split belongs to (num1 or num2).

2. Map phase
Each line of file num1 is rewritten as:
1 2,1,Beijing (key=1, value=2,1,Beijing)
Each line of file num2 is rewritten as:
1 1,Beijing Red Star,1 (key=1, value=1,Beijing Red Star,1)
Then context.write(key,value);
so records from both files end up grouped under the same key:
1 2,1,Beijing
1 1,Beijing Red Star,1

3. Reduce phase
The reducer first checks whether the first field of each value is 1 or 2.
If it is 1, the second field becomes the output key (key=Beijing Red Star).
If it is 2, the third field becomes the output value (value=Beijing).
context.write(key,value);
The output file then looks like:
Beijing Red Star Beijing
Done! A worked trace for one key follows.
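As a concrete trace (derived from the code below and the sample data above), the reduce call for key 1 receives the values

2,1,Beijing
1,Beijing Red Star,1
1,Beijing Rising,1
1,Back of Beijing,1

(the order inside the iterable is not guaranteed), so the company list becomes [Beijing Red Star, Beijing Rising, Back of Beijing] and the address list becomes [Beijing]; their cross product produces the three Beijing lines shown in the output above.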
Code:

package Sum_File_Relation;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class Sum_File_Relation {

    static String INPUT_PATH = "hdfs://master:9000/b";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();
        String k = "";

        // Record which input file (num1 or num2) the current split comes from.
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit fs = (FileSplit) context.getInputSplit();
            k = fs.getPath().getName();
            System.out.println(k);
        }
        /* Each line of file num1 is rewritten as:
           1 2,1,Beijing (key=1, value=2,1,Beijing)
           Each line of file num2 is rewritten as:
           1 1,Beijing Red Star,1 (key=1, value=1,Beijing Red Star,1)
           Then context.write(key,value);
           so records from both files end up grouped under the same key:
           1 2,1,Beijing
           1 1,Beijing Red Star,1 */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            String[] tokens = value.toString().split(",");

            if (tokens != null && tokens.length == 2) {
                if (k.equals("num1")) {
                    // num1 line "1,Beijing": the id is the key; flag 2 marks an address record.
                    output_key.set(tokens[0].trim());
                    output_value.set(2 + "," + value);
                    context.write(output_key, output_value);
                } else if (k.equals("num2")) {
                    // num2 line "Beijing Red Star,1": the id is the key; flag 1 marks a company record.
                    output_key.set(tokens[1].trim());
                    output_value.set(1 + "," + value);
                    context.write(output_key, output_value);
                }

                System.out.println(tokens[0] + "  -  " + tokens[1]);
            }
        }
    }

/* The reducer first checks whether the first field of each value is 1 or 2.
   If it is 1, the second field (the company name) becomes the output key (key=Beijing Red Star).
   If it is 2, the third field (the address) becomes the output value (value=Beijing).
   context.write(key,value);
   The output file then looks like:
   Beijing Red Star Beijing */
    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Flag 1 values carry a company name (from num2); flag 2 values carry an address (from num1).
            List<String> companies = new ArrayList<String>();
            List<String> addresses = new ArrayList<String>();

            for (Text line : values) {
                String[] tokens = line.toString().split(",");
                if (tokens[0].equals("1")) {
                    companies.add(tokens[1]);
                    System.out.println(1 + "==" + tokens[1]);
                } else if (tokens[0].equals("2")) {
                    addresses.add(tokens[2]);
                    System.out.println(2 + "==" + tokens[2]);
                }
            }

            // Emit the join result: every company under this key paired with its address.
            for (String c : companies) {
                for (String g : addresses) {
                    output_key.set(c);
                    output_value.set(g);
                    context.write(output_key, output_value);
                }
            }
        }
    }


    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000/");

        // Delete the output directory if it already exists, otherwise the job fails.
        Path outputpath = new Path(OUTPUT_PATH);
        FileSystem fs = outputpath.getFileSystem(conf);
        if (fs.exists(outputpath)) {
            fs.delete(outputpath, true);
        }

        Job job = Job.getInstance(conf);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}


Reposted from blog.csdn.net/qq_38262266/article/details/79198090