目录
Name Finder
命名查找器(Name Finder)可以检测文本中的命名实体和数字。为了能够检测到实体,命名查找器需要一个模型。模型依赖于训练它时所用的语言和实体类型。OpenNLP项目提供了许多预训练的名称查找模型,这些模型是在各种免费语料库上训练得到的,可以在OpenNLP的模型下载页面下载。要在原始文本中查找名称,必须先将文本切分为句子和词元(token)。
默认情况下,训练数据每行一个句子,句子中的词语需已经过tokenizer分词(以空格分隔)。命名实体用 <START:类型> 和 <END> 标签进行标注(对应API中的Span);输入中遇到一个空行表示文档结束。官方建议训练一个模型至少需要15000个句子。如:
<START:person> Pierre Vinken <END> , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Mr . <START:person> Vinken <END> is chairman of Elsevier N.V. , the Dutch publishing group .
OpenNLP定义了一个默认的特征生成器,在没有指定自定义特征生成器时使用;用户也可以通过API自定义特征生成器。
模型训练
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderEvaluator;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.FMeasure;
/**
 * Trains an English "person" name-finder model from an annotated text file,
 * saves it to disk, and prints the F-measure of the model on the same data.
 */
public class NameFinderTrain {

  /**
   * Reads training sentences from {@code <user.dir>/resources/naneFinder.txt},
   * trains a model, writes it to
   * {@code <user.dir>/opennlpmodel/en-ner-person-my.bin}, and prints the
   * evaluation result to standard output.
   *
   * @param args unused
   * @throws IOException if the training data cannot be read or the model
   *     cannot be written
   */
  public static void main(String[] args) throws IOException {
    String rootDir = System.getProperty("user.dir") + File.separator;
    String fileResourcesDir = rootDir + "resources" + File.separator;
    String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
    // Path of the training data.
    // NOTE(review): the file name is spelled "naneFinder.txt" - confirm it matches the file on disk.
    String filePath = fileResourcesDir + "naneFinder.txt";
    // Path where the trained model is saved.
    String modelPath = modelResourcesDir + "en-ner-person-my.bin";

    InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));

    // Closing the sample stream also closes the line stream it wraps.
    try (ObjectStream<NameSample> sampleStream =
        new NameSampleDataStream(
            new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {

      // Train the model.
      TokenNameFinderFactory factory = new TokenNameFinderFactory();
      TokenNameFinderModel model =
          NameFinderME.train(
              "en", "person", sampleStream, TrainingParameters.defaultParams(), factory);

      // Save the model; try-with-resources flushes the buffer and closes the file.
      try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(new File(modelPath)))) {
        model.serialize(modelOut);
      }

      // Evaluate the model. Training has read the stream to its end, so rewind
      // it first; otherwise the evaluator would see no samples at all.
      // Note that this scores the model on its own training data, so the
      // result is optimistic compared with a held-out test set.
      sampleStream.reset();
      TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
      evaluator.evaluate(sampleStream);
      FMeasure result = evaluator.getFMeasure();
      System.out.println(result.toString());
    }
  }
}
命名识别
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;
/**
 * Loads a person name-finder model from disk and prints every name span
 * detected in a fixed, pre-tokenized sample sentence.
 */
public class NameFinderPredit {

  /**
   * Loads {@code <user.dir>/opennlpmodel/en-ner-person.bin}, runs name
   * detection on a hard-coded token array, and prints one line per span found.
   *
   * @param args unused
   * @throws IOException if the model file cannot be read
   */
  public static void main(String[] args) throws IOException {
    String rootDir = System.getProperty("user.dir") + File.separator;
    String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
    String modelPath = modelResourcesDir + "en-ner-person.bin";

    // Load the model; the stream is only needed while the model is being read.
    TokenNameFinderModel model;
    try (InputStream modelIn = new FileInputStream(modelPath)) {
      model = new TokenNameFinderModel(modelIn);
    }

    // Instantiate the name finder.
    NameFinderME nameFinder = new NameFinderME(model);
    String[] tokens = {
      "Vinken",
      "is",
      "61",
      "years",
      "old",
      "Pierre",
      ".",
      "Pierre",
    };

    // Detect names. Each Span holds the [start, end) token positions of one entity.
    Span[] nameFinds = nameFinder.find(tokens);
    for (Span span : nameFinds) {
      // Join every token covered by the span so multi-token names are printed in full.
      String name =
          String.join(" ", java.util.Arrays.copyOfRange(tokens, span.getStart(), span.getEnd()));
      // "length" is the number of spans found, not the length of this span.
      System.out.println("type:" + span.getType() + ";Tostring:" + span.toString()
          + ";length:" + nameFinds.length + "start:" + span.getStart()
          + ";end:" + span.getEnd() + ";name:" + name);
    }
  }
}
输出:
type:person;Tostring:[5..6) person;length:2start:5;end:6;name:Pierre
type:person;Tostring:[7..8) person;length:2start:7;end:8;name:Pierre