目录
Sentence Detector
语句检测器:OpenNLP语句检测器可以检测某个标点字符是否标记了句子的结尾。在这个意义上,句子被定义为两个标点符号之间去除首尾空白后的最长字符序列。第一句和最后一句是这条规则的例外:第一个非空白字符被假定为句子的开头,最后一个非空白字符被假定为句子的结尾。
通常句子检测是在文本被分词之前完成的,但是也可以先执行分词,让句子检测器处理已经分词的文本。OpenNLP语句检测器无法根据句子内容识别句子边界,例如文章的标题可能被误认为是第一句话的开头部分。OpenNLP中的大多数组件都期望输入已经被分割成句子。
Sentence Detector的输入是一段文字,输出为每个句子一行。
模型训练
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
 * Trains an OpenNLP sentence-detector model from a plain-text training file
 * and writes the serialized model to disk.
 *
 * <p>Training data is read as UTF-8 from
 * {@code <user.dir>/resources/sentenceDetector.txt}; the trained model is
 * written to {@code <user.dir>/opennlpmodel/da-sent-my.bin}.
 */
public class SentenceDetectorTrain {

    /**
     * Runs the training and serializes the resulting model.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the training data cannot be read, the model
     *         directory cannot be created, or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the training data.
        String filePath = fileResourcesDir + "sentenceDetector.txt";
        // Path where the trained model is saved.
        String modelPath = modelResourcesDir + "da-sent-my.bin";

        // FileOutputStream does not create missing parent directories, so make
        // sure the output directory exists before spending time on training.
        File modelDir = new File(modelResourcesDir);
        if (!modelDir.isDirectory() && !modelDir.mkdirs()) {
            throw new IOException("Cannot create model directory: " + modelDir);
        }

        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
        // Reads the training data line by line.
        ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);

        SentenceModel model;
        // Only the outer stream is a managed resource: the sample stream wraps
        // the line stream, so the line stream is not closed separately here.
        try (ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream)) {
            SentenceDetectorFactory sentenceFactory = new SentenceDetectorFactory();
            // NOTE(review): the language code is "en" while the model file name
            // uses the "da-" prefix -- confirm which language is intended.
            model = SentenceDetectorME.train("en", sampleStream, sentenceFactory, TrainingParameters.defaultParams());
        }

        // Closing the stream releases the file handle and flushes any buffered bytes.
        try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelPath))) {
            model.serialize(modelOut);
        }
    }
}
语句检测
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
/**
 * Loads a serialized OpenNLP sentence model and prints the sentences detected
 * in a fixed sample text, one per line.
 *
 * <p>The model is read from {@code <user.dir>/opennlpmodel/da-sent.bin}.
 */
public class SentenceDetectorPredit {

    /**
     * Loads the model, runs sentence detection on the sample text and prints
     * each detected sentence to standard output.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the model file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // NOTE(review): the training example in this document writes
        // "da-sent-my.bin", whereas this loads "da-sent.bin" -- confirm which
        // model file is intended.
        String modelPath = modelResourcesDir + "da-sent.bin";

        // Load the model; the stream is only needed while the model is read,
        // so it is closed as soon as construction finishes.
        SentenceModel model;
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            model = new SentenceModel(modelIn);
        }

        // Instantiate the detector from the loaded model.
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);

        // Run sentence detection and print one sentence per line.
        String[] sentences = sentenceDetector.sentDetect("First sentence. Second sentence. ");
        for (String sentence : sentences) {
            System.out.println(sentence);
        }
    }
}