分词去停用词操作

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;

import ICTCLAS.I3S.AC.ICTCLAS50;

/**
 * Segments a text file with ICTCLAS50 and removes stop words from the result.
 *
 * <p>Each line of the source file is segmented into space-separated tokens;
 * tokens that appear in the stop-word list are dropped and the remaining
 * tokens are written, one line per input line, to the destination file.
 */
public class FileExcludeStopWord {

    /** Path of the stop-word list; every line of that file is treated as one stop word. */
    public static final String stopWordTable = "停用词表路径";

    /** Charset name used when exchanging bytes with the ICTCLAS segmenter. */
    private static final String SEGMENTER_CHARSET = "gb2312";

    public static void main(String[] args) {
        // Source and destination file paths (placeholders to be replaced by the user).
        String srcFile = "源文件路径";
        String destFile = "目的文件路径";
        new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);
    }

    /**
     * Reads {@code srcFile} line by line, segments each line, removes the stop
     * words listed in {@link #stopWordTable} and writes the result to
     * {@code destFile}.
     *
     * <p>Files are read and written with the platform default charset. Any
     * failure is reported on standard error via a stack trace; this method does
     * not propagate exceptions. All streams are closed and the segmenter is
     * released on every exit path, including initialization failure.
     *
     * @param srcFile  path of the text file to segment and filter
     * @param destFile path of the file that receives the filtered text
     */
    public void fileExcludeStopWord(String srcFile, String destFile) {
        BufferedReader srcFileBr = null;
        BufferedReader stopWordFileBr = null;
        BufferedWriter destFileBw = null;
        ICTCLAS50 segmenter = null;
        boolean segmenterReady = false;
        try {
            // Open the source file and the stop-word list.
            srcFileBr = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(srcFile))));
            stopWordFileBr = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(stopWordTable))));

            // Output file that receives the text with stop words removed.
            destFileBw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(destFile))));

            // Load the stop words into a set for constant-time lookups.
            Set<String> stopWordSet = new HashSet<String>();
            String stopWord;
            while ((stopWord = stopWordFileBr.readLine()) != null) {
                stopWordSet.add(stopWord);
            }

            // Initialize the segmenter; "." is the directory passed to
            // ICTCLAS_Init as the location of its library/data files.
            segmenter = new ICTCLAS50();
            String argu = ".";
            if (!segmenter.ICTCLAS_Init(argu.getBytes(SEGMENTER_CHARSET))) {
                System.out.println("分词所用库初始化失败。");
                return;
            }
            segmenterReady = true;

            String paragraph;
            while ((paragraph = srcFileBr.readLine()) != null) {
                // Segment the line. NOTE(review): the arguments 2 and 0 presumably
                // select GB encoding and disable POS tagging -- confirm against
                // the ICTCLAS50 API documentation.
                byte[] segmentedBytes = segmenter.ICTCLAS_ParagraphProcess(
                        paragraph.getBytes(SEGMENTER_CHARSET), 2, 0);
                String segmented = new String(segmentedBytes, SEGMENTER_CHARSET);

                // Write the filtered line to the destination file.
                destFileBw.write(removeStopWords(segmented, stopWordSet));
                destFileBw.newLine();
            }
        } catch (Exception e) {
            // Covers missing files, I/O errors and segmenter failures alike.
            e.printStackTrace();
        } finally {
            // Close the writer first so buffered output is flushed even on failure.
            closeAndReport(destFileBw);
            closeAndReport(stopWordFileBr);
            closeAndReport(srcFileBr);
            if (segmenterReady) {
                // Release the segmenter only after a successful init.
                segmenter.ICTCLAS_Exit();
            }
        }
    }

    /**
     * Drops every token of a space-separated string that is a stop word.
     *
     * @param segmented space-separated tokens produced by the segmenter
     * @param stopWords set of tokens to drop
     * @return the kept tokens, each followed by a single space
     */
    private static String removeStopWords(String segmented, Set<String> stopWords) {
        StringBuilder kept = new StringBuilder();
        for (String token : segmented.split(" ")) {
            if (!stopWords.contains(token)) {
                kept.append(token).append(" ");
            }
        }
        return kept.toString();
    }

    /**
     * Closes a resource if it was opened, printing (not propagating) any
     * I/O error so that the remaining resources can still be closed.
     *
     * @param resource the resource to close; may be {@code null}
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

猜你喜欢

转载自forever1220.iteye.com/blog/2097471