NLP Learning (6) - Part-of-Speech Tagging

  • If you get "Resource averaged_perceptron_tagger not found.", the POS tagger model is missing; run the following in IPython (or any Python shell):
    import nltk
    nltk.download('averaged_perceptron_tagger')

1. The POS Tagger

import nltk
from nltk.tag import pos_tag # POS tagger
from nltk.tokenize import word_tokenize

text = word_tokenize("And now for something completely different")
print(pos_tag(text))
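With the default (Penn Treebank) tagset, this should print:

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]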

text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman') # find words used in similar contexts - mostly the same POS; note similar() prints its results itself, so wrapping it in print() would also print None
text.similar('bought')
text.similar('over')
text.similar('the')

2. Tagged Corpora

  • The universal tagset: each corpus uses its own tagset, but all of them can be mapped onto a common set of 12 coarse tags (see the mapping sketch below):
    ADJ (adjective), ADP (adposition), ADV (adverb), CONJ (conjunction),
    DET (determiner/article), NOUN (noun), NUM (numeral), PRON (pronoun),
    PRT (particle), VERB (verb), . (punctuation), X (other)
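A minimal illustration of the mapping (a sketch; map_tag lives in nltk.tag and assumes the universal_tagset data package has been fetched via nltk.download('universal_tagset')):

import nltk
from nltk.tag import map_tag # maps a corpus-specific tag onto the universal tagset
print(map_tag('en-ptb', 'universal', 'NNS')) # NOUN (Penn Treebank plural noun)
print(map_tag('brown', 'universal', 'VBD'))  # VERB (Brown past-tense verb)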
# Tagged corpora
tagged_token = nltk.tag.str2tuple('fly/NN') # convert a 'word/TAG' string into a (word, tag) tuple
print(tagged_token)
print(tagged_token[0])
print(tagged_token[1])

sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
 '''
print([nltk.tag.str2tuple(t) for t in sent.split()])

print(nltk.corpus.brown.tagged_words()) # the tagged Brown corpus
print(nltk.corpus.nps_chat.tagged_words()) # other corpora come pre-tagged as well
print(nltk.corpus.conll2000.tagged_words())
print(nltk.corpus.treebank.tagged_words())

print(nltk.corpus.brown.tagged_words(tagset='universal')) # mapped to the universal tagset
print(nltk.corpus.nps_chat.tagged_words(tagset='universal'))
print(nltk.corpus.conll2000.tagged_words(tagset='universal'))
print(nltk.corpus.treebank.tagged_words(tagset='universal'))
  • Counting the most common POS tags in the Brown corpus (news category)
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged) # frequency distribution over tags
print(tag_fd.most_common())
tag_fd.plot()
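For reference, the NLTK book reports these counts for the news category:

[('NOUN', 30640), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389), ...]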

(Plot: frequency of each universal POS tag in the news category.)

3. Automatic Tagging

  • Default tagger (tags every word with one and the same POS; by itself this is crude and not very informative, but for new words of unknown POS, defaulting to the most frequent tag is a reasonable first guess - a quick evaluation follows the output below)
import nltk
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.tag(tokens))


[('I', 'NN'), ('do', 'NN'), ('not', 'NN'), ('like', 'NN'),
('green', 'NN'), ('eggs', 'NN'), ('and', 'NN'), ('ham', 'NN'),
(',', 'NN'), ('I', 'NN'), ('do', 'NN'), ('not', 'NN'),
('like', 'NN'), ('them', 'NN'), ('Sam', 'NN'), ('I', 'NN'),
('am', 'NN'), ('!', 'NN')]
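To quantify how poor this baseline is, evaluate it on the tagged Brown news sentences (a quick sketch; the NLTK book reports an accuracy of roughly 0.13, i.e. only about one token in eight is actually a noun):

brown_tagged_sents = nltk.corpus.brown.tagged_sents(categories='news')
print(default_tagger.evaluate(brown_tagged_sents)) # ~0.13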
  • Regular expression tagger (tags words by their characteristic suffixes)
# Regular expression tagger
import nltk
from nltk.corpus import brown
patterns = [
    (r'.*ing$', 'VBG'), # gerunds
    (r'.*ed$', 'VBD'), # simple past
    (r'.*es$', 'VBZ'), # 3rd singular present
    (r'.*ould$', 'MD'), # modals
    (r'.*\'s$', 'NN$'), # possessive nouns
    (r'.*s$', 'NNS'), # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers (note the escaped dot)
    (r'.*', 'NN') # everything else defaults to noun
]
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown.sents()[3])) # tag a sentence

print(regexp_tagger.evaluate(brown.tagged_sents(categories='news'))) # check accuracy: 0.20326391789486245
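Patterns are tried in order and the first match wins, so rule order matters. A quick check on a few made-up tokens:

print(regexp_tagger.tag(['running', 'watched', 'watches', 'dogs', '3.14']))
# [('running', 'VBG'), ('watched', 'VBD'), ('watches', 'VBZ'), ('dogs', 'NNS'), ('3.14', 'CD')]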
  • Lookup tagger (store the most likely tag for each of the most frequent words, and look words up in that table)
import nltk
from nltk.corpus import brown
fd = nltk.FreqDist(brown.words(categories='news')) # word frequency distribution
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news')) # word -> tag frequencies
most_freq_words = fd.most_common(100) # the 100 most common words
likely_tags = dict((word, cfd[word].max()) for (word, freq) in most_freq_words) # most likely tag for each
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
print(baseline_tagger.evaluate(brown.tagged_sents(categories='news'))) # roughly 0.46

sent = brown.sents(categories='news')[3]
print(baseline_tagger.tag(sent)) # words outside the 100-word table are tagged None

# backoff: anything the lookup table misses falls back to the default tagger
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))

def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15) # model sizes 1, 2, 4, ..., 16384
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()

(Plot: accuracy climbs steeply for small model sizes, then levels off as the model grows.)

4. N-gram Tagging (statistics-based tagging)

  • Unigram tagging
# Training and test data are the same (this inflates the accuracy)
import nltk    
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(brown_sents[2007]))
print(unigram_tagger.evaluate(brown_tagged_sents))#0.9349006503968017

# Training and test data are different
# Split into separate training and test sets
import nltk
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
print(size)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents)) # 0.8121200039868434 - lower, since the test set now contains unseen words
  • N-gram tagging: an n-gram tagger conditions on the current word together with the tags of the previous n-1 tokens; a sketch of the standard backoff combination follows below.
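A minimal sketch of the standard NLTK recipe: a bigram tagger that backs off to a unigram tagger, which in turn backs off to a default tagger (reusing train_sents and test_sents from the split above; the NLTK book reports about 0.84 accuracy for this combination):

t0 = nltk.DefaultTagger('NN')                    # last resort: call everything a noun
t1 = nltk.UnigramTagger(train_sents, backoff=t0) # most likely tag for each word
t2 = nltk.BigramTagger(train_sents, backoff=t1)  # previous tag + current word as context
print(t2.evaluate(test_sents))                   # ~0.84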

Reposted from blog.csdn.net/hot7732788/article/details/89314179