Lucene synonym search plus exact extraction of custom keywords (Lucene 5.3.0)


This post targets Lucene 5.3.0. If you are on Lucene 3.x, see http://write.blog.csdn.net/postedit/78291868 instead (keyword extraction only, no synonym search).


This article covers two features:

1. Exact extraction of custom keywords

2. Synonym search and extraction


Enough talk, straight to the code.


The synonym analyzer class is defined as follows:

package com.daelly.sample.lucene.analyzer.synonyms;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;

public class SynonymsAnalyzer extends Analyzer {
	
	private final String synonymsPath;
	
	public SynonymsAnalyzer(String synonymsPath) {
		if(synonymsPath==null || synonymsPath.isEmpty()) {
			throw new IllegalArgumentException("synonymsPath must be provided!");
		}
		this.synonymsPath = synonymsPath;
	}

	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		SynonymFilterFactory factory = null;
		try {
			factory = getSynonymFilterFactory();
		} catch (IOException e) {
			e.printStackTrace();
		}
		// IK tokenizer in smart mode; wrap it with the synonym filter when the
		// synonyms file loaded successfully, otherwise fall back to plain IK tokens.
		Tokenizer tokenizer = new IKTokenizer(true);
		if (factory != null) {
			TokenStream tokenStream = factory.create(tokenizer);
			return new TokenStreamComponents(tokenizer, tokenStream);
		}
		return new TokenStreamComponents(tokenizer);
	}
	
	// Builds the SynonymFilterFactory from either a classpath resource
	// ("classpath:xxx.txt") or an absolute file-system path.
	private SynonymFilterFactory getSynonymFilterFactory() throws IOException {
		if (synonymsPath.startsWith("classpath:")) {
			String path = synonymsPath.replace("classpath:", "");
			Map<String, String> args = new HashMap<String, String>();
			args.put("synonyms", path);
			SynonymFilterFactory factory = new SynonymFilterFactory(args);
			factory.inform(new ClasspathResourceLoader());
			return factory;
		}
		// Split an absolute path into base directory and file name so the
		// FilesystemResourceLoader can resolve the synonyms file.
		int index = synonymsPath.lastIndexOf(File.separator);
		String dir = synonymsPath.substring(0, index);
		String name = synonymsPath.substring(index + 1);
		Map<String, String> args = new HashMap<String, String>();
		args.put("synonyms", name);
		SynonymFilterFactory factory = new SynonymFilterFactory(args);
		Path baseDirectory = Paths.get(dir);
		FilesystemResourceLoader loader = new FilesystemResourceLoader(baseDirectory);
		factory.inform(loader);
		return factory;
	}

}
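
Before indexing anything, you can sanity-check the analyzer by printing the tokens it emits. Below is a minimal sketch, assuming the SynonymsAnalyzer above is on the classpath; the demo class name and the "classpath:action.txt" location are illustrative, so point them at your own synonyms file. Tokens that have synonyms in the file will show up expanded in the output.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

public class SynonymsAnalyzerDemo {

	public static void main(String[] args) throws IOException {
		// "classpath:action.txt" is a placeholder; use your own synonyms file.
		Analyzer analyzer = new SynonymsAnalyzer("classpath:action.txt");
		TokenStream ts = analyzer.tokenStream("content", "hello 关键词");
		CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
		ts.reset();
		while (ts.incrementToken()) {
			// Each emitted token, including any synonym expansions, on its own line.
			System.out.println(term.toString());
		}
		ts.end();
		ts.close();
	}
}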


Next, the indexing class. INDEXDIR is the index directory, DATADIR is the path of the file to be indexed, and ACTIONDIR is the path of the synonyms file.

package com.apache.luence;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

public class AddIndex {

	private static final String INDEXDIR = "D:\\TestSolr\\Index\\Test"; 
	private static final String DATADIR = "D:\\TestSolr\\src\\resource\\node.dic";
	private static final String ACTIONDIR = "D:\\TestSolr\\src\\resource\\data\\action.txt";
	
	public AddIndex() {
		try {
			Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
			// Use the synonym-aware analyzer so synonyms are expanded at index time.
			IndexWriterConfig config = new IndexWriterConfig(new SynonymsAnalyzer(ACTIONDIR));
			IndexWriter iwriter = new IndexWriter(directory, config);

			// Each line of the data file becomes one document with a single "content" field.
			File files = new File(DATADIR);
			List<String> contents = this.getContent(files);
			for (String content : contents) {
				Document doc = new Document();
				doc.add(new TextField("content", content, Field.Store.YES));
				iwriter.addDocument(doc);
			}
			iwriter.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	private List<String> getContent(File files) {
		List<String> strList = new ArrayList<String>();
		// Read the data file line by line as UTF-8; try-with-resources closes the
		// reader even if an exception is thrown. FileNotFoundException and
		// UnsupportedEncodingException are both subclasses of IOException.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(files), "UTF-8"))) {
			String str = br.readLine();
			while (str != null) {
				strList.add(str);
				str = br.readLine();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return strList;
	}
	
	public static void main(String[] args) {
		new AddIndex();
	}
}
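
For reference, getContent() simply reads DATADIR line by line and turns each line into one document, so node.dic is expected to be a plain UTF-8 text file with one entry per line. A hypothetical example (these entries are illustrative, not from the original post):

关键词
你好
测试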


The search class:
package com.apache.luence;

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;


/**
 * Reads documents from the index to extract keywords and look up synonyms.
 */
public class IKUtil {
	
	private  static final String INDEXDIR = "D:\\TestSolr\\Index\\Test";

	// Returns true if the keyword hits at least one document in the index.
	private boolean search(String keyword) {
		boolean flag = false;
		Directory directory = null;
		DirectoryReader ireader = null;
		try {
			directory = FSDirectory.open(Paths.get(INDEXDIR));
			ireader = DirectoryReader.open(directory);
			IndexSearcher isearcher = new IndexSearcher(ireader);
			TermQuery query = new TermQuery(new Term("content", keyword));
			TopDocs hits = isearcher.search(query, 10);
			flag = hits.totalHits > 0;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Null checks avoid a NullPointerException if opening the index failed.
			try {
				if (ireader != null) ireader.close();
				if (directory != null) directory.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return flag;
	}

	/**
	 * Extracts from the input text the keywords that exist in the index.
	 * @param sInput the input text
	 * @return the matched keywords
	 */
	public String[] getKeyWords(String sInput) {
		List<String> result = new ArrayList<String>();
		try {
			IKAnalyzer analyzer = new IKAnalyzer(true);
			TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(sInput));
			CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
			tokenStream.reset();
			// Keep only the tokens that actually hit the index, without duplicates.
			while (tokenStream.incrementToken()) {
				String word = term.toString();
				if (this.search(word) && !result.contains(word)) {
					result.add(word);
				}
			}
			tokenStream.end();
			tokenStream.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
		return result.toArray(new String[result.size()]);
	}
	
	/**
	 * Looks up the entries in the index that match the given word.
	 * @param src the word to search for
	 * @return the matching entries
	 */
	public List<String> getSynonyms(String src) {
		List<String> results = new ArrayList<String>();
		try {
			Term term = new Term("content", src);
			Query query = new TermQuery(term);
			Directory directory = FSDirectory.open(Paths.get(INDEXDIR));
			IndexReader reader = DirectoryReader.open(directory);
			IndexSearcher searcher = new IndexSearcher(reader);
			TopDocs docs = searcher.search(query, 10);
			for (ScoreDoc scoreDoc : docs.scoreDocs) {
				Document doc = searcher.doc(scoreDoc.doc);
				String synonyms = doc.get("content");
				// Skip empty entries instead of aborting the whole loop.
				if (synonyms == null || synonyms.isEmpty()) {
					continue;
				}
				results.add(synonyms);
			}
			reader.close();
			directory.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return results;
	}
	public static void main(String[] args) {
		String input = "这是一整个关键词哈哈";
		// To test synonym lookup instead, comment out the getKeyWords line
		// and uncomment the getSynonyms line below.
		String[] results = new IKUtil().getKeyWords(input);
//		List<String> results = new IKUtil().getSynonyms(input);
		for (String result : results) {
			System.out.println(result);
		}
	}
}

First, let's test the exact keyword extraction feature.

The index file is shown in the screenshot below.

The run result is shown in the screenshot below.



As the screenshot shows, "哈哈" was filtered out.


Now let's test the synonym feature.

The synonyms file is shown in the screenshot below.


Note that the file needs a trailing blank line (honestly, I have no idea why...), and it must be saved as UTF-8.
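
For reference, SynonymFilterFactory reads the Solr synonyms format: one group of comma-separated equivalent words per line, with optional one-way mappings via "=>" and comments starting with "#". A hypothetical action.txt (the entries below are illustrative, not the ones from the original post):

hello, 你好, hi
关键词, keyword

With the first line above, a document containing "你好" is also indexed under "hello", which is exactly what makes the next test work.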

Let's verify it, still using the index built earlier.


As the screenshot shows, if you recall the index file, it does not contain the keyword "hello", yet "hello" is still recognized. That is because the SynonymsAnalyzer expanded synonyms at index time, so the synonym tokens were written into the index and a term query on "hello" now finds a match.

Now let's verify the synonym lookup feature.

This time, swap the comments in IKUtil.main(): comment out the getKeyWords line and uncomment the getSynonyms line. The run result is shown in the screenshot below.



Bingo.

From here, it's just a matter of adapting the code to your own requirements.


