1. 准备需要4个jar包
ansj_seg-5.1.6.jar
nlp-lang-1.7.7.jar
ansj_lucene5_plug-5.1.1.2.jar
AnsjTokenizerFactory.jar(这个包是自己打包的)
Maven 仓库下载地址:http://mvnrepository.com
http://mvnrepository.com/artifact/org.ansj
创建一个普通的java项目引入包
AnsjTokenizerFactory.java 代码
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
/**
 * Solr {@link TokenizerFactory} that produces ansj-based tokenizers.
 *
 * <p>Configured from schema.xml attributes:
 * <ul>
 *   <li>{@code isQuery}   - true (default) selects the precise query-time analyzer
 *                           ({@code ToAnalysis}); false selects the fine-grained
 *                           index-time analyzer ({@code IndexAnalysis}).</li>
 *   <li>{@code pstemming} - read for configuration parity; not consumed by either
 *                           analyzer in this class.</li>
 *   <li>{@code stopwords} - optional path to a UTF-8 stop-word file, one word per line.</li>
 * </ul>
 */
public class AnsjTokenizerFactory extends TokenizerFactory {
    // Read from config but currently unused by create(); kept for schema compatibility.
    boolean pstemming;
    // Chooses between query-time (true) and index-time (false) analysis in create().
    boolean isQuery;
    // Path of the stop-word dictionary from the "stopwords" attribute; may be null.
    private String stopwordsDir;
    // Stop-word filters shared by every tokenizer this factory creates.
    public List<StopRecognition> filters;

    /**
     * Builds the factory from the attributes of the {@code <tokenizer>} element.
     *
     * @param args attribute map supplied by Solr; consumed keys are
     *             {@code isQuery}, {@code pstemming}, and {@code stopwords}
     */
    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        // NOTE(review): the original called getLuceneMatchVersion() and discarded
        // the result; that no-op call has been removed.
        filters = new ArrayList<StopRecognition>();
        isQuery = getBoolean(args, "isQuery", true);
        pstemming = getBoolean(args, "pstemming", false);
        stopwordsDir = get(args, "stopwords");
        addStopwords(stopwordsDir);
    }

    /**
     * Loads one stop word per line (UTF-8) from the given file into a single
     * {@code StopRecognition} filter and registers it in {@link #filters}.
     * A null path means no stop words were configured and is skipped quietly.
     *
     * @param dir path to the stop-word file, or null
     */
    private void addStopwords(String dir) {
        if (dir == null) {
            System.out.println("no stopwords dir");
            return;
        }
        System.out.println("stopwords: " + dir);
        // try-with-resources guarantees the stream is closed even when readLine()
        // throws; the original leaked the reader on IOException.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(dir)), StandardCharsets.UTF_8))) {
            StopRecognition stopFilter = new StopRecognition();
            String word;
            while ((word = br.readLine()) != null) {
                stopFilter.insertStopWords(word);
            }
            filters.add(stopFilter);
        } catch (FileNotFoundException e) {
            // Include the path so a misconfigured schema is diagnosable from the log.
            System.out.println("No stopword file found: " + dir);
        } catch (IOException e) {
            System.out.println("stopword file io exception: " + e.getMessage());
        }
    }

    /**
     * Creates a tokenizer: {@code ToAnalysis} for query time, {@code IndexAnalysis}
     * for index time. Both variants apply the loaded stop-word filters.
     *
     * @param factory attribute factory supplied by Lucene
     * @return a new {@link AnsjTokenizer}
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        if (isQuery) {
            // Query side: precise segmentation.
            return new AnsjTokenizer(new ToAnalysis(), filters, null);
        }
        // Index side: fine-grained segmentation emits more terms for recall.
        return new AnsjTokenizer(new IndexAnalysis(), filters, null);
    }
}
右击项目选择export导出jar包,记得导出的时候不要连带的将lib下依赖的jar包也导进去,标红的地方不要打勾选
2.将这四个jar包放到tomcat里apache-tomcat-solr\webapps\solr\WEB-INF\lib文件夹下面
3.solr_home文件夹下的修改
修改/conf/schema.xml文件
<fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="com.XXX.ansj.AnsjTokenizerFactory" isQuery="false" pstemming="true" stopwords="E:/XXXXXX/apache-tomcat-solr/bin/library/stop.dic"/>
</analyzer>
<analyzer type="query">
<tokenizer class="com.XXX.ansj.AnsjTokenizerFactory" isQuery="true" pstemming="true" stopwords="E:/XXXXXX/apache-tomcat-solr/bin/library/stop.dic"/>
</analyzer>
</fieldType>
<tokenizer class="com.XXX.ansj.AnsjTokenizerFactory" isQuery="false" pstemming="true" stopwords="E:/XXXXXX/apache-tomcat-solr/bin/library/stop.dic"/>
class名就是刚才打包的全类名
stopwords 就是停用词典的路径,stopwords和AnsjTokenizerFactory.java里的要一致,也可以命名为words或其他的但这两个地方要一致
4.引入ansj_seg的词典
在tomcat中solr项目文件夹下的WEB-INF中看有没有classes文件夹,没有的话新建,将ansj_seg的词典文件夹和properties文件移到classes文件夹中(这个ansj_seg-master是我从github上下载下来的,地址: https://github.com/NLPchina/ansj_seg)
也可以直接把词典放到tomcat的bin目录里 apache-tomcat-solr\bin
参考博客:https://blog.csdn.net/allthesametome/article/details/46907197
solr4 + ansj_seg的整合:https://blog.csdn.net/liujun_for_java/article/details/80514226