MapReduce倒排索引及Combiner

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/qq_18505209/article/details/100547305
数据:
index1:
I love Beijing and I love China
I love Jinan I love
I love Taian
index2:
Beijing is Beijing is the capital of China
Jinan is the capital city of Shandong
I am am I
index3:
a city in eastern China
the capital of Shandong province
population 2,726,400
I am 
Output:
2,726,400	index3:1;
Beijing	index2:2;index1:1;
China	index1:1;index2:1;index3:1;
I	index1:5;index2:2;index3:1;
Jinan	index1:1;index2:1;
Shandong	index2:1;index3:1;
Taian	index1:1;
a	index3:1;
am	index2:2;index3:1;
and	index1:1;
capital	index2:2;index3:1;
city	index2:1;index3:1;
eastern	index3:1;
in	index3:1;
is	index2:3;
love	index1:5;
of	index2:2;index3:1;
population	index3:1;
province	index3:1;
the	index2:2;index3:1;
代码:
package MapReducer05;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;

public class WordCountJob {

    /**
     * Emits one ("&lt;fileName&gt;_&lt;word&gt;", "1") pair per whitespace-separated token,
     * tagging each word with the input file it came from so the combiner can
     * produce per-file counts before the shuffle.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        // Reused constant value; Hadoop serializes on write(), so sharing is safe.
        private static final Text ONE = new Text("1");

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Name of the input file this split belongs to (e.g. "index1").
            FileSplit split = (FileSplit) context.getInputSplit();
            String fileName = split.getPath().getName();
            for (String word : value.toString().split(" ")) {
                // Consecutive or leading spaces yield empty tokens; skip them so
                // an empty "word" never enters the index.
                if (word.isEmpty()) {
                    continue;
                }
                context.write(new Text(fileName + "_" + word), ONE);
            }
        }
    }

    /**
     * Receives (word, "file:count") pairs produced by {@code MyCombiner} and
     * concatenates them into "file:count;" entries ordered by descending count,
     * then ascending file name (the ordering defined by MyWritable.compareTo).
     *
     * Example input: I -> [index1:2, index2:1]; love -> [index1:1]
     *
     * NOTE(review): this reducer parses every value as "file:count", i.e. it
     * assumes the combiner ALWAYS ran. Hadoop does not guarantee combiner
     * execution, so a raw "1" value would make split(":")[1] throw — confirm
     * the job configuration before relying on this in production.
     */
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Local, per-invocation set: no state can leak between reduce() calls.
            Set<MyWritable> ordered = new TreeSet<MyWritable>();
            for (Text value : values) {
                String[] fileAndCount = value.toString().split(":");
                ordered.add(new MyWritable(fileAndCount[0],
                        Integer.parseInt(fileAndCount[1])));
            }
            // StringBuilder avoids the O(n^2) cost of repeated String +=.
            StringBuilder result = new StringBuilder();
            for (MyWritable entry : ordered) {
                result.append(entry);
            }
            context.write(key, new Text(result.toString()));
        }
    }

    /**
     * Configures and submits the inverted-index job; exits 0 on success, 1 on
     * failure. Input/output paths are hard-coded for a local Windows setup.
     */
    public static void main(String[] args)
            throws InterruptedException, IOException, ClassNotFoundException {
        //System.setProperty("hadoop.home.dir", "F:\\hadoop-2.6.4");
        Configuration conf = new Configuration();
        BasicConfigurator.configure();
        Job job = Job.getInstance(conf, "mr");

        job.setJarByClass(WordCountJob.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // The combiner rewrites the key from "file_word" to "word"; the reducer
        // depends on that transformation (see the note on MyReducer).
        job.setCombinerClass(MyCombiner.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Chen\\Desktop\\input\\dpsy\\*"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Chen\\Desktop\\12"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

/**
 * Combiner: sums the "1" values for each "&lt;fileName&gt;_&lt;word&gt;" key and re-emits
 * the pair as (word, "fileName:sum").
 *
 * Example input it consumes:
 *   index1_I 1
 *   index1_I 1
 *   index2_I 1
 *   index1_love 1
 *
 * NOTE(review): emitting a different key than it receives violates the usual
 * combiner contract — Hadoop may run a combiner zero or more times, yet the
 * reducer here requires this re-keying to have happened exactly once per
 * group. Verify the job never runs with combining disabled.
 */
class MyCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // key layout is "<fileName>_<word>".
        String[] fileAndWord = key.toString().split("_");
        int total = 0;
        for (Text one : values) {
            total += Integer.parseInt(one.toString());
        }
        // Re-key by the word alone; the file name moves into the value.
        context.write(new Text(fileAndWord[1]), new Text(fileAndWord[0] + ":" + total));
    }
}
/**
 * Hadoop-serializable pair of (file name, occurrence count).
 *
 * Ordering: descending by count, then ascending by file name — this drives
 * the TreeSet ordering used when the reducer formats its output line.
 */
class MyWritable implements WritableComparable<MyWritable> {
    private String fname;
    private int count;

    /** No-arg constructor required by Hadoop's Writable reflection. */
    public MyWritable() {}

    public MyWritable(String fname, int count) {
        this.fname = fname;
        this.count = count;
    }

    public int compareTo(MyWritable o) {
        // Integer.compare avoids the overflow possible with "o.count - this.count".
        int byCountDesc = Integer.compare(o.count, this.count);
        if (byCountDesc != 0) {
            return byCountDesc;
        }
        return this.fname.compareTo(o.fname);
    }

    /** Consistent with compareTo: equal iff both fields match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof MyWritable)) {
            return false;
        }
        MyWritable other = (MyWritable) obj;
        return count == other.count
                && (fname == null ? other.fname == null : fname.equals(other.fname));
    }

    @Override
    public int hashCode() {
        return 31 * (fname == null ? 0 : fname.hashCode()) + count;
    }

    /** Rendered exactly as the reducer emits it: "file:count;". */
    @Override
    public String toString() {
        return this.fname + ":" + this.count + ";";
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(fname);
        out.writeInt(count);
    }

    public void readFields(DataInput in) throws IOException {
        this.fname = in.readUTF();
        this.count = in.readInt();
    }

    public String getFname() {
        return fname;
    }

    public void setFname(String fname) {
        this.fname = fname;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }
}

猜你喜欢

转载自blog.csdn.net/qq_18505209/article/details/100547305
今日推荐