Hadoop Learning: Implementing an Inverted Index with MapReduce

## ✌✌✌ As the ancients said, a good memory is no match for a worn pen, and a journey of a thousand miles begins with a single step. A thousand lines of code a day is a must, and so is a daily summary. Aim for the big tech companies, and with hope in your heart you will be unstoppable, hahaha!!! ✌✌✌


I. ✌ Problem Requirements

File 1: a.txt
[screenshot of a.txt's contents in the original post]

File 2: b.txt
[screenshot of b.txt's contents in the original post]

File 3: c.txt
[screenshot of c.txt's contents in the original post]

Final output format:
[screenshot of the expected output in the original post]
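
The screenshots do not carry over here, so as a purely hypothetical stand-in (the words and counts below are invented for illustration, not the original data), the three files and the expected result might look like this:

```text
a.txt: apple pear apple
b.txt: pear banana
c.txt: apple banana banana

Expected output (one line per word, keys sorted):
apple	a.txt-->2	b.txt-->0	c.txt-->1
banana	a.txt-->0	b.txt-->1	c.txt-->2
pear	a.txt-->1	b.txt-->1	c.txt-->0
```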

II. ✌ Approach

> First, in the map phase, obtain the name of the file the current split belongs to (in setup()).
> Then, in the map() method, emit each word as the Key and the file name concatenated with "1" as the Value (e.g. "a.txt1").
> In the reduce phase, iterate over each Key's Values, check which file each Value came from, and accumulate a count per file.
> Finally, write the result out in the required format.
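
Continuing the hypothetical files above, the record flow through the job would look roughly like this (an illustration of the shuffle, not actual job output):

```text
Map output:     ("apple", "a.txt1"), ("pear", "a.txt1"), ("apple", "a.txt1"), ...
After shuffle:  "apple" -> ["a.txt1", "a.txt1", "c.txt1"]
Reduce output:  apple	a.txt-->2	b.txt-->0	c.txt-->1
```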

III. ✌ Code Implementation

1. ✌ Map Class

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Map extends Mapper<LongWritable, Text, Text, Text> {

    String name;

    //Get the name of the file this split comes from
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        FileSplit inputSplit = (FileSplit) context.getInputSplit();

        name = inputSplit.getPath().getName();

    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();

        String[] words = line.split(" ");

        //Emit one record per word occurrence, e.g. ("apple", "a.txt1"), ("pear", "b.txt1")
        for (String word : words) {
            context.write(new Text(word), new Text(name + 1));
        }

    }

}
```

2. ✌ Reduce Class

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class Reduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        int a_sum = 0, b_sum = 0, c_sum = 0;

        //For each key, count how many values came from each file
        for (Text value : values) {
            if (value.toString().contains("a.txt")) {
                a_sum += 1;
            } else if (value.toString().contains("b.txt")) {
                b_sum += 1;
            } else {
                c_sum += 1;
            }
        }

        //Output format: apple	a.txt-->3	b.txt-->1	c.txt-->2
        context.write(key, new Text("a.txt-->" + a_sum + "\t" + "b.txt-->" + b_sum + "\t" + "c.txt-->" + c_sum + "\t"));

    }

}
```
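
Note that this reducer hardcodes the three file names, so the job only works for exactly a.txt, b.txt, and c.txt. As a minimal sketch of a more general variant (my own alternative, not the original author's code; it assumes every Value has the form "&lt;filename&gt;1" as produced by the Map class above), the per-file counts can be accumulated in a TreeMap:

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.TreeMap;

//Hypothetical generalized reducer: counts occurrences per file without hardcoding file names
public class GeneralReduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        //TreeMap keeps file names sorted so the output line is deterministic
        TreeMap<String, Integer> counts = new TreeMap<>();

        for (Text value : values) {
            String v = value.toString();
            //Strip the trailing "1" the mapper appended, recovering the file name
            String file = v.substring(0, v.length() - 1);
            counts.merge(file, 1, Integer::sum);
        }

        //Build a line like: a.txt-->3	b.txt-->1	c.txt-->2
        StringBuilder sb = new StringBuilder();
        for (java.util.Map.Entry<String, Integer> e : counts.entrySet()) {
            sb.append(e.getKey()).append("-->").append(e.getValue()).append("\t");
        }

        context.write(key, new Text(sb.toString()));

    }

}
```

One behavioral difference: a file in which a word never appears simply does not show up on that word's line, whereas the hardcoded version always prints all three counts, including zeros.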

3. ✌ Driver Class

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

public class Driver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        //Input and output paths
        args = new String[]{"D:/input/inputword", "D:/output"};

        //Configure log4j so log messages are printed
        BasicConfigurator.configure();

        //Create the configuration
        Configuration conf = new Configuration();

        //Get the Job object
        Job job = Job.getInstance(conf);

        //Associate the Driver, Map, and Reduce classes
        job.setJarByClass(Driver.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        //Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        //Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }

}
```
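
One practical caveat: FileOutputFormat fails the job if the output directory already exists. A minimal guard (my own addition, not part of the original post) is to delete it before submitting, for example inside main() after conf is created:

```java
//Requires: import org.apache.hadoop.fs.FileSystem;
FileSystem fs = FileSystem.get(conf);
Path out = new Path(args[1]);
if (fs.exists(out)) {
    fs.delete(out, true); //true = recursive delete
}
```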

Reposted from blog.csdn.net/m0_47256162/article/details/115029783