Lucene Analyzer 分词 一

package com.hb;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes with {@link LetterTokenizer}, lower-cases every
 * token with {@link LowerCaseFilter} and finally removes stop words with
 * {@link StopFilter}.
 *
 * <p>The stop-word set is either {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
 * on its own, or that set merged with caller-supplied words.
 */
public class MyStopAnalyzer extends Analyzer {
	/** Stop words handed to the {@link StopFilter}; assigned once per instance. */
	private final Set<?> stops;

	/**
	 * Creates an analyzer whose stop words are the given words plus the
	 * built-in English stop words.
	 *
	 * @param strs additional stop words; {@code null} is treated as
	 *             "no additional words" instead of failing
	 */
	public MyStopAnalyzer(String[] strs) {
		// Guard against null so the analyzer degrades to the default stop words.
		String[] extraWords = (strs == null) ? new String[0] : strs;
		// makeStopSet converts the array into a set (third argument: ignoreCase).
		Set<Object> merged = StopFilter.makeStopSet(Version.LUCENE_35, extraWords, true);
		// Add the built-in stop words to the freshly created set; the shared
		// ENGLISH_STOP_WORDS_SET constant itself is never modified.
		merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
		stops = merged;
	}

	/**
	 * Creates an analyzer that uses only the built-in English stop words.
	 */
	public MyStopAnalyzer() {
		stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
	}

	/**
	 * Builds the tokenizer and filter chain for this analyzer.
	 *
	 * @param fieldname name of the field being analyzed (not used by this analyzer)
	 * @param reader    source of the text to tokenize
	 * @return a token stream of lower-cased tokens with stop words removed
	 */
	@Override
	public TokenStream tokenStream(String fieldname, Reader reader) {
		// Chain order matters: tokenize, lower-case, then drop stop words.
		TokenStream letters = new LetterTokenizer(Version.LUCENE_35, reader);
		TokenStream lowerCased = new LowerCaseFilter(Version.LUCENE_35, letters);
		return new StopFilter(Version.LUCENE_35, lowerCased, stops);
	}

}

猜你喜欢

转载自hbiao68.iteye.com/blog/2108307
今日推荐