For English text, the standard analyzer, StandardAnalyzer, is used for tokenization.
For Chinese word segmentation, SmartChineseAnalyzer is used; it ships with Lucene (in the smartcn analyzer contrib module, not in lucene-core).
The test below is written with JUnit 4.
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.junit.Test;

public class Analyzertest {

    // For English text, switch to StandardAnalyzer:
    //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31);

    // For Chinese word segmentation, switch to the Chinese sample text:
    //String text = "我是中国人";
    String text = "IndexWriter javadoc a apach2.0.txt";

    @Test
    public void test() {
        try {
            analyzer(analyzer, text);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void analyzer(Analyzer al, String text) throws Exception {
        TokenStream tokenStream = al.tokenStream("content", new StringReader(text));
        // TermAttribute is deprecated; the javadoc recommends CharTermAttribute instead
        tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            CharTermAttribute ta = tokenStream.getAttribute(CharTermAttribute.class);
            System.out.println(ta.toString());
        }
    }
}
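The loop above runs on Lucene 3.1, but the TokenStream javadoc for the 3.x line recommends a slightly different consumer pattern: capture the CharTermAttribute once via addAttribute() before iterating (the same attribute object is refilled on every incrementToken() call), and bracket consumption with reset(), end(), and close(). A minimal sketch of that pattern, as a drop-in alternative to the analyzer() method (the method name printTokens is my own, not from the original code):

    public void printTokens(Analyzer al, String text) throws Exception {
        TokenStream stream = al.tokenStream("content", new StringReader(text));
        // addAttribute() returns the attribute instance; the stream
        // refills this same object on each incrementToken() call
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset(); // position the stream before the first token
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end(); // record end-of-stream state (e.g. final offset)
        } finally {
            stream.close(); // release resources held by the analysis chain
        }
    }

For the commented-in Chinese sample text, SmartChineseAnalyzer should emit word-level tokens (roughly 我 / 是 / 中国 / 人), whereas StandardAnalyzer with Version.LUCENE_31 emits one token per CJK character, which is why the smartcn analyzer is the better choice for Chinese.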