Lucene是一个简单而功能强大的基于Java的搜索库,可以为任何应用程序提供搜索功能。它是一个可扩展的高性能库,可用于索引和搜索几乎任何类型的文本。
项目中使用Lucene实现业务菜单的搜索功能:客户输入业务菜单的部分文字,经Lucene检索后即可查询到相符的菜单目录并进行业务操作。闲话少说,本人根据项目中Lucene的使用情况,结合新版(6.6)Lucene的用法写了个DEMO用于学习。
首先是DEMO中Lucene使用的公共常量类。
/**
 * Shared constants for the Lucene demo: index field names and the search result limit.
 *
 * <p>This class only holds constants and is not meant to be instantiated.
 *
 * @author zhouyi
 */
public final class LuceneConstants {

    /** Name of the index field that holds a file's text content. */
    public static final String CONTENTS = "contents";

    /** Name of the index field that holds a file's name. */
    public static final String FILE_NAME = "filename";

    /** Name of the index field that holds a file's path. */
    public static final String FILE_PATH = "filepath";

    /** Maximum number of hits requested from a single search (10). */
    public static final int MAX_SEARCH = 10;

    /** Prevents instantiation of this constants holder. */
    private LuceneConstants() {
    }
}
然后对需要索引的文件做类别区分,这里暂时只对TXT文件进行索引。
import java.io.File;
import java.io.FileFilter;
/**
 * {@link FileFilter} that accepts only paths whose name ends with ".txt".
 *
 * <p>The name is lower-cased (using the default locale) before the comparison,
 * so ".TXT" and ".Txt" are accepted as well.
 */
public class TextFileFilter implements FileFilter {

    /** Lower-case suffix a file name must end with to be accepted. */
    private static final String TEXT_SUFFIX = ".txt";

    /**
     * Tests whether the given path names a text file.
     *
     * @param pathname the path to test
     * @return {@code true} if the lower-cased file name ends with ".txt"
     */
    @Override
    public boolean accept(File pathname) {
        final String lowerCasedName = pathname.getName().toLowerCase();
        return lowerCasedName.endsWith(TEXT_SUFFIX);
    }
}
下面开始对需要检索的文件建立索引。注意:新版的Lucene改用NIO2(java.nio.file)中的Path、Files等API,摒弃了java.io.File等传统IO方式。
import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import cn.zhouyi.demo.lucene.LuceneConstants; public class Indexer { //索引创建类 private IndexWriter writer ; public Indexer(String indexDirectoryPath) throws IOException{ //读取需要索引的文件到Lucene的目录类中,新版的Lucene只支持IO2中的Path类型的变量了。 Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath)) ; //创建分词器,这个分词器必须和IndexSearcher中的一致。 Analyzer analyzer = new StandardAnalyzer() ; //新版的Lucene中索引创建类只接收IndexWriterConfig配置。 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); writer = new IndexWriter(indexDirectory, iwc); } public void close() throws CorruptIndexException, IOException{ writer.close(); } //给文件创建索引 private void indexFile(Path path) throws IOException{ //因为使用了Java7的try(),所以文件流的操作必须在try()中写完,否则会自动关闭流。 try(InputStream stream = Files.newInputStream(path)){ //建立Lucene文档 Document document = new Document() ; Field contentField = new TextField(LuceneConstants.CONTENTS, new BufferedReader(new InputStreamReader(stream,StandardCharsets.UTF_8))) ; Field fileNameField = new StringField(LuceneConstants.FILE_NAME, path.getFileName().toString(), 
Field.Store.YES); Field filePathField = new StringField(LuceneConstants.FILE_PATH, path.toString(), Field.Store.YES); document.add(contentField); document.add(fileNameField); document.add(filePathField); System.out.println("Indexing "+path.toString()); //写入文档到索引创建类中 writer.addDocument(document) ; } } //遍历文件目录下的文件,给这些文件加索引 public int createIndex(String docPath, FileFilter filter) throws IOException{ Path path = Paths.get(docPath) ; if(!Files.isReadable(path)){ System.out.println("Document Directory '"+path.toAbsolutePath()+ "'is not readable or is not exist"); System.exit(1); } if(Files.isDirectory(path)){ //NIO2中优雅地遍历文件 Files.walkFileTree(path, new SimpleFileVisitor<Path>(){ @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs){ try{ if(filter.accept(file.toFile())){ indexFile(file) ; } }catch(IOException ex){ ex.printStackTrace(); } return FileVisitResult.CONTINUE; } }) ; }else{ if(filter.accept(path.toFile())){ indexFile(path) ; } } return writer.numDocs() ; } }上面给对应目录的文件创建好了分词索引后,下面开始读取索引进行搜索。
import java.io.IOException; import java.nio.file.Paths; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import cn.zhouyi.demo.lucene.LuceneConstants; public class Searcher { //索引搜索类 private IndexSearcher indexSearcher ; //索引读取类 private IndexReader reader ; //将用户的搜索条件封装成Lucene的query条件 private QueryParser queryParser ; private Query query ; public Searcher(String indexDirectoryPath) throws IOException{ //将索引文件读取到lucene的索引读取类中 Directory directory = FSDirectory.open(Paths.get(indexDirectoryPath)); reader = DirectoryReader.open(directory); //创建索引搜索类 indexSearcher = new IndexSearcher(reader) ; //此处分词器需要和索引类中的一致 Analyzer analyzer = new StandardAnalyzer(); queryParser = new QueryParser(LuceneConstants.CONTENTS, analyzer); } //根据用户的搜索条件返回lucene搜索的文档 public TopDocs search(String searchQuery) throws ParseException, IOException{ query = queryParser.parse(searchQuery); return indexSearcher.search(query, LuceneConstants.MAX_SEARCH) ; } //根据文档的id获取文档,注scoreDoc=TopDocs.scoreDocs[i] public Document getDocument(ScoreDoc scoreDoc) throws IOException{ return indexSearcher.doc(scoreDoc.doc); } public void close() throws IOException{ reader.close(); } }以上简单的索引类和搜索类已经写好了,下面写一个测试类来测试一下lucene的功能。
import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import cn.zhouyi.demo.lucene.LuceneConstants; public class LuceneTester { String indexDir = "E:\\code\\lucence\\index" ; String dataDir = "E:\\code\\lucence\\docs" ; Indexer indexer ; Searcher searcher ; public static void main(String args[]){ LuceneTester tester ; try{ tester = new LuceneTester() ; tester.createIndex(); tester.search("you"); }catch(Exception ex){ ex.printStackTrace(); } } private void createIndex() throws IOException{ indexer = new Indexer(indexDir) ; int numIndexed ; long startTime = System.currentTimeMillis() ; //numIndexed = indexer.createIndex(dataDir, new TextFileFilter()) ; //使用一下Java8的新特性来实现一下文件的筛选。 numIndexed = indexer.createIndex(dataDir, (pathname)->{ return pathname.getName().toLowerCase().endsWith(".txt"); }); long endTime = System.currentTimeMillis() ; indexer.close(); System.out.println(numIndexed+" File indexed, time taken: "+(endTime-startTime)+" ms"); } private void search(String searchQuery) throws IOException, ParseException{ searcher = new Searcher(indexDir); long startTime = System.currentTimeMillis(); TopDocs hits = searcher.search(searchQuery); long endTime = System.currentTimeMillis(); System.out.println(hits.totalHits+" documents found. Time :" + (endTime - startTime)); for(ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = searcher.getDocument(scoreDoc); System.out.println("File: " + doc.get(LuceneConstants.FILE_PATH)); } searcher.close(); } }执行上面测试类得到的结果:
Indexing E:\code\lucence\docs\doc1.txt
1 File indexed, time taken: 105 ms
1 documents found. Time :24
File: E:\code\lucence\docs\doc1.txt
收工完毕。