1, 下载lucene, 获取demo相关jar包

https://lucene.apache.org/core/downloads.html
在这里插入图片描述

2, 源代码实现明细

配置pom.xml

 		<dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-demo</artifactId>
            <version>8.6.0</version>
        </dependency>
        
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>8.6.0</version>
        </dependency>
      
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>8.6.0</version>
        </dependency>

org.apache.lucene.demo.IndexFiles : StandardAnalyzer index阶段

使用 StandardAnalyzer 来解析文件内容

// Index all text files under a directory.
public class IndexFiles {
    
    
  /** Index all text files under a directory. */
  public static void main(String[] args) {
    
    
    //指定源文档的路径：绝对路径 或 相对路径
    //cmd> ls D:\download\lucene-7.7.3\demo\txt\
    //      123.txt  456.txt  hello123.txt  hello12345678.txt
    String docsPath = "D:\\download\\lucene-7.7.3\\demo\\txt";
    //指定索引存放的位置：绝对路径 或 相对路径
    String indexPath = "D:\\download\\lucene-7.7.3\\demo\\index1";
    //是创建索引/ 还是更新索引
    boolean create = true;

    final Path docDir = Paths.get(docsPath);
    Date start = new Date();
    System.out.println("Indexing to directory '" + indexPath + "'...");

    Directory indexPathDir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    if (create) {
    
    
      // Create a new index in the directory, removing old indexs
      iwc.setOpenMode(OpenMode.CREATE);
    } else {
    
    
      // Add new documents to an existing index:
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    }

    IndexWriter writer = new IndexWriter(indexPathDir, iwc);
    indexDocs(writer, docDir);

    // NOTE: if you want to maximize search performance,
    // you can optionally call forceMerge here.  This can be
    // a terribly costly operation, so generally it's only
    // worth it when your index is relatively static (ie
    // you're done adding documents to it):
    //
    // writer.forceMerge(1);
    writer.close();
    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
  }

  /**
   * Indexes the given file using the given writer, or if a directory is given,
   * 
   * NOTE: This method indexes one document per input file.  This is slow.  For good
   * throughput, put multiple documents into your input file(s).  An example of this is
   * in the benchmark module, which can create "line doc" files, one document per line,
   * using the
   * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
   * >WriteLineDocTask</a>.
   *  
   * @param writer Writer to the index where the given file/dir info will be stored
   * @param path The file to index, or the directory to recurse into to find files to index
   */
  static void indexDocs(final IndexWriter writer, Path path) throws IOException {
    
    
    if (Files.isDirectory(path)) {
    
    
      Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
    
    
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
    
    
          try {
    
    
            indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
          } catch (IOException ignore) {
    
    
            // don't index files that can't be read.
          }
          return FileVisitResult.CONTINUE;
        }
      });
    } else {
    
    
      indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
    }
  }

  /** Indexes a single document */
  static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    
    
    try (InputStream stream = Files.newInputStream(file)) {
    
    
      // make a new, empty document
      Document doc = new Document();
      
      // Add the path of the file as a field named "path".  Use a
      Field pathField = new StringField("path", file.toString(), Field.Store.YES);
      doc.add(pathField);
      
      // Add the last modified date of the file a field named "modified".
      doc.add(new LongPoint("modified", lastModified));
      
      // Add the contents of the file to a field named "contents".  Specify a Reader,
      // so that the text of the file is tokenized and indexed, but not stored.
      doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
      
      if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
    
    
        // New index, so we just add the document (no old document can be there):
        System.out.println("adding " + file);
        writer.addDocument(doc);
      } else {
    
    
        // Existing index (an old copy of this document may have been indexed) so 
        // we use updateDocument instead to replace the old one matching the exact 
        // path, if present:
        System.out.println("updating " + file);
        writer.updateDocument(new Term("path", file.toString()), doc);
      }
    }
  }
}

org.apache.lucene.demo.SearchFiles : StandardAnalyzer query阶段

在这里插入图片描述

// search demo.
public class SearchFiles {
    
    
    public static void main(String[] args) throws Exception {
    
    
        //指定索引存放的位置：绝对路径 或 相对路径
        String index = "D:\\download\\lucene-7.7.3\\demo\\index1";
        //搜索的文本内容
        String queryString = "hello";
        //搜索的文本内容 --- 从哪个索引字段检索
        String field = "contents";
        //分页大小
        int hitsPerPage = 10;
        //是否查看：更详细的内容( 搜索配置的score )
        boolean raw = true;

        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queryString);
        System.out.println("Searching for: " + query.toString(field));
        doSearch(searcher, query, hitsPerPage, raw);
        reader.close();
    }

    //查询
    public static void doSearch(IndexSearcher searcher, Query query, int hitsPerPage, boolean raw) throws IOException {
    
    
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
        // Collect enough docs to show 5 pages
        TopDocs results = searcher.search(query, 5 * hitsPerPage);
        ScoreDoc[] hits = results.scoreDocs;

        int numTotalHits = Math.toIntExact(results.totalHits.value);
        System.out.println(numTotalHits + " total matching documents ");

        System.out.println("\n\n>>>>>>>> doc details >>>>>>>>>");
        for (int i = 0; i < numTotalHits; i++) {
    
    
            if (raw) {
    
                                  // output raw format
                System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score);
                //doc=2 score=0.29767057
            }

            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("path");
            if (path != null) {
    
    
                System.out.println((i + 1) + ". " + path);
                //1. D:\download\lucene-7.7.3\demo\txt\hello123.txt
                String title = doc.get("title");
                if (title != null) {
    
    
                    System.out.println("   Title: " + doc.get("title"));
                }
            } else {
    
    
                System.out.println((i + 1) + ". " + "No path for this document");
            }
          System.out.println();
        }
    }
}

StandardAnalyzer 中文分词示例: index, query

 	@org.junit.Test
    public void analyzeTest() throws IOException {
    
    
        StringReader stopwords = new StringReader("the \n bigger");
        StringReader stringReader = new StringReader("The quick BIGGER brown   fox jumped over the bigger lazy dog. ");

        //StandardAnalyzer: 内置 LowerCaseFilter, StopFilter
        StandardAnalyzer analyzer = new StandardAnalyzer(stopwords);
        TokenStream tokenStream = analyzer.tokenStream("contents, ", stringReader);
        //    final StandardTokenizer src = new StandardTokenizer();
        //    TokenStream tok = new LowerCaseFilter(src);
        //    tok = new StopFilter(tok, stopwords);
        tokenStream.reset();
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
    
    
            System.out.print("[" + term.toString() + "] ");
            //[quick] [brown] [fox] [jumped] [over] [lazy] [dog]
        }
        tokenStream.close();
    }

3, 中文词库下载：同义词，停用词

链接地址： https://github.com/fighting41love/funNLP
直接下载github文件：将 github.com 替换为 raw.githubusercontent.com，并去除 /blob:

url="https://github.com/guotong1988/chinese_dictionary/blob/master/dict_synonym.txt"
echo $url|sed -e '[email protected]@raw.githubusercontent.com@' -e 's@blob/@@'

生成solr格式的停用词：逗号分隔

public static void main(String[] args) throws IOException {
    
    
        // 创建字符流对象，并根据已创建的字节流对象创建字符流对象
        String file = "D:\\download\\solr-近义词，停用词\\synonym.txt";
        String outfile = "D:\\download\\solr-近义词，停用词\\synonym2.txt";
        BufferedWriter bw = new BufferedWriter(new FileWriter(outfile));
        BufferedReader raf = new BufferedReader(new FileReader(file));

        //同义词
        //Aa01A01= 人 士 人物 人士 人氏 人选
        //Aa01A02= 人类 生人 全人类
        String s = null;
        while ((s = raf.readLine()) != null) {
    
    
            String[] arr = s.split("=");
            if (arr.length < 2) continue;
            String[] arr2 = arr[1].split("\\s");

            for (int i=0;i <arr2.length; i++){
    
    
                if (i != arr2.length -1 ){
    
    
                    if ( ! arr2[i].trim().equals("")){
    
    
                        System.out.print(arr2[i]+",");
                        bw.write(arr2[i]+",");
                    }
                }else {
    
    
                    System.out.print(arr2[i]);
                     bw.write(arr2[i]);
                }
            }
            System.out.println();
            bw.write("\n");
        }
        bw.flush();
        bw.close();
        raf.close();
}
//人,士,人物,人士,人氏,人选
//人类,生人,全人类
//人手,人员,人口,人丁,口,食指
//劳力,劳动力,工作者

理解solr工作原理：lucene

文章目录

1, 下载lucene, 获取demo相关jar包

2, 源代码实现明细

org.apache.lucene.demo.IndexFiles : StandardAnalyzer index阶段

org.apache.lucene.demo.SearchFiles : StandardAnalyzer query阶段

StandardAnalyzer 中文分词示例: index, query

3, 中文词库下载：同义词，停用词

猜你喜欢