利用MapReduce来实现全局搜索引擎
根据内容来查看文档,可以统计每个单词在一些文档中出现了几次,来实现全文检索的这样的一个功能
预备文件:
hadoop中分三步走:
1.mapper对文档初步处理, 获得每个单词以及该单词所在文档的路径, 每个单词出现的次数初步设置为1;
输出格式 : 单词||文档uri 1;
2.combiner对于每个文档同样的单词初步的合计统计次数并输出到reducer
合并每个文件单词出现的次数,也就是词频
输出格式: 单词 uri-------词频
3.reducer经过shuffle处理形成最终的文件
输出格式: 单词 uri-------词频;uri-------词频;
代码展示:
package demo01.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Inverted-index MapReduce job.
 *
 * <p>Pipeline:
 * <ol>
 *   <li>Mapper: emits {@code word||uri -> "1"} for every whitespace-separated token.</li>
 *   <li>Combiner: sums the counts per {@code word||uri} key and re-emits
 *       {@code word -> uri-------count}.</li>
 *   <li>Reducer: concatenates all {@code uri-------count} entries for a word into
 *       {@code word -> uri-------count;uri-------count;...}.</li>
 * </ol>
 */
public class InvertedIndex extends Configured implements Tool {

    /**
     * Configures and submits the job.
     *
     * @param args args[0] = input path, args[1] = output path (deleted first if present)
     * @return 0 on success, 1 on job failure, -1 on bad arguments
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("args error!");
            return -1;
        }
        Path src = new Path(args[0]);
        Path desc = new Path(args[1]);
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        // Delete a stale output directory so the job can be re-run.
        if (fs.exists(desc)) {
            fs.delete(desc, true);
        }
        Job job = Job.getInstance(conf, "倒排索引");
        job.setJarByClass(getClass());
        job.setMapperClass(MyMapper.class);
        // These configure the MAP output types (not the reducer's).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(MyCombiner.class);
        // BUG FIX: this line was commented out, so MyReducer never ran and the
        // final "word uri-------count;uri-------count;" merge was never produced.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, src);
        FileOutputFormat.setOutputPath(job, desc);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int code = ToolRunner.run(new InvertedIndex(), args);
        System.exit(code);
    }

    /**
     * Step 1: for each whitespace-separated token in a line, emits
     * key = {@code word||uri} (uri is the input split's file path),
     * value = {@code "1"}.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        String uri;
        // Reused across map() calls to avoid per-record allocation.
        Text key2 = new Text();
        Text value2 = new Text();

        /** Caches the current split's file path once per task. */
        @Override
        public void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            this.uri = split.getPath().toString();
        }

        @Override
        public void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] strs = value.toString().split("\\s+");
            for (String str : strs) {
                key2.set(str + "||" + uri);
                value2.set("1");
                context.write(key2, value2);
            }
        }
    }

    /**
     * Step 2: sums the per-file occurrence count for each {@code word||uri}
     * key and emits key = {@code word}, value = {@code uri-------count}.
     *
     * <p>NOTE(review): this combiner changes the key ({@code word||uri} ->
     * {@code word}), which violates the Hadoop combiner contract — combiners
     * may run zero or more times. The classic robust design moves this
     * splitting into the reducer; kept as-is here to preserve the example's
     * structure, but verify on real workloads.
     */
    public static class MyCombiner extends Reducer<Text, Text, Text, Text> {
        Text key4 = new Text();
        Text value4 = new Text();

        @Override
        public void reduce(Text key3, Iterable<Text> value3,
                Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (Text v3 : value3) {
                sum += Integer.parseInt(v3.toString());
            }
            String composite = key3.toString();
            int sep = composite.indexOf("||");
            String word = composite.substring(0, sep);
            String uri = composite.substring(sep + 2); // skip the "||" separator
            key4.set(word);
            value4.set(uri + "-------" + sum);
            context.write(key4, value4);
        }
    }

    /**
     * Step 3: concatenates every {@code uri-------count} value for a word into
     * a single semicolon-terminated list:
     * {@code word -> uri-------count;uri-------count;...}.
     */
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        Text key6 = new Text();
        Text value6 = new Text();

        @Override
        public void reduce(Text key5, Iterable<Text> value5, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // StringBuilder: no synchronization needed for this task-local buffer.
            StringBuilder sb = new StringBuilder();
            for (Text v5 : value5) {
                sb.append(v5).append(";");
            }
            key6.set(key5.toString());
            value6.set(sb.toString());
            context.write(key6, value6);
        }
    }
}
欢迎提出见解跟指导