报错 Resource averaged_perceptron_tagger not found.
是因为找不到词性标注器,在 IPython 中执行下列操作即可:import nltk; nltk.download('averaged_perceptron_tagger')
1.词性标注器
# POS-tag a tokenized sentence, then explore distributional similarity
# over the (lowercased) Brown corpus.
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

text = word_tokenize("And now for something completely different")
print(pos_tag(text))

# BUG FIX: Text.similar() prints its results itself and returns None,
# so wrapping it in print() only added a spurious "None" line per call.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')
text.similar('bought')
text.similar('over')
text.similar('the')
2.标注语料库
统一标注集合词性
# Parse a 'word/TAG' string into a (word, tag) tuple and show its parts.
pair = nltk.tag.str2tuple('fly/NN')
print(pair)
word, tag = pair
print(word)
print(tag)

# A whole pre-tagged sentence in the same slash notation (Brown-style tags).
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PP
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/R
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''
print(list(map(nltk.tag.str2tuple, sent.split())))
# Several corpora ship pre-tagged; tagset='universal' maps each corpus's
# own tag inventory onto one shared coarse tagset.
readers = (nltk.corpus.brown, nltk.corpus.nps_chat,
           nltk.corpus.conll2000, nltk.corpus.treebank)
for reader in readers:
    print(reader.tagged_words())
for reader in readers:
    print(reader.tagged_words(tagset='universal'))
from nltk.corpus import brown

# Frequency of each universal POS tag in the Brown 'news' category.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for _, tag in brown_news_tagged)
print(tag_fd.most_common())
tag_fd.plot()
3.自动标注
默认标注器(所有词都默认标注为一种词性,这种方法不好,没什么意义,但是对于一些不知道词性的新词来说,可以默认标注为频数最多的词性,可以猜猜看…)
import nltk

# Baseline: tag every token 'NN'. Useless on its own, but a reasonable
# fallback guess for unknown words (NN is the most frequent tag).
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.tag(nltk.word_tokenize(raw)))
[ ( 'I' , 'NN' ) , ( 'do' , 'NN' ) , ( 'not' , 'NN' ) , ( 'like' , 'NN' ) ,
( 'green' , 'NN' ) , ( 'eggs' , 'NN' ) , ( 'and' , 'NN' ) , ( 'ham' , 'NN' ) ,
( ',' , 'NN' ) , ( 'I' , 'NN' ) , ( 'do' , 'NN' ) , ( 'not' , 'NN' ) , ( 'like' ,
'NN' ) , ( 'them' , 'NN' ) , ( 'Sam' , 'NN' ) , ( 'I' , 'NN' ) , ( 'am' , 'NN' ) ,
( '!' , 'NN' ) ]
正则表达式标注器(根据特殊后缀对单词进行词性标注)
from nltk.corpus import brown

# Tag words by suffix patterns; rules are tried in order, so the
# catch-all 'NN' rule must come last.
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd-person singular present
    (r'.*ould$', 'MD'),               # modals: could, would, should
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    # BUG FIX: the decimal point must be escaped — an unescaped '.'
    # matches any character, so e.g. '1x23' was also tagged CD.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                    # default: noun
]
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown.sents()[3]))
print(regexp_tagger.evaluate(brown.tagged_sents(categories='news')))
import nltk
from nltk.corpus import brown

# Lookup tagger: memorize the most likely tag for each of the 100 most
# frequent words; a second version adds 'NN' as the backoff for the rest.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)  # idiom: pass n directly, no slice
likely_tags = dict((word, cfd[word].max()) for word, _ in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
# BUG FIX: these results were computed and silently discarded in a
# script context — print them so the demo actually shows something.
print(baseline_tagger.evaluate(brown.tagged_sents(categories='news')))
sent = brown.sents(categories='news')[3]
print(baseline_tagger.tag(sent))
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))
def performance(cfd, wordlist):
    """Accuracy of a lookup tagger built from `wordlist`, backed off to 'NN'.

    Uses the module-level `brown` news sentences as the evaluation set.
    """
    model = {w: cfd[w].max() for w in wordlist}
    tagger = nltk.UnigramTagger(model=model, backoff=nltk.DefaultTagger('NN'))
    return tagger.evaluate(brown.tagged_sents(categories='news'))
def display():
    """Plot lookup-tagger accuracy as the model grows to cover the most
    frequent words of the Brown 'news' category (sizes 1, 2, 4, ... 2**14)."""
    import pylab
    fd = nltk.FreqDist(brown.words(categories='news'))
    # BUG FIX: iterating a FreqDist (a Counter) yields words in
    # first-occurrence order, NOT most-frequent first, so
    # words_by_freq[:size] was not the `size` most common words.
    # Sort explicitly by descending frequency.
    words_by_freq = [w for w, _ in fd.most_common()]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()
4.N-gram标注(基于统计算法的标注)
import nltk
from nltk.corpus import brown

# Unigram tagger: assigns each word its single most likely tag,
# learned from tagged training sentences.
news_tagged = brown.tagged_sents(categories='news')
news_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(news_tagged)
print(unigram_tagger.tag(news_sents[2007]))
# NOTE: evaluated on the very data it was trained on, so this score
# overstates real-world accuracy — see the train/test split below.
print(unigram_tagger.evaluate(news_tagged))
import nltk
from nltk.corpus import brown

# Proper evaluation: train on the first 90% of sentences and
# measure accuracy on the held-out final 10%.
tagged_sents = brown.tagged_sents(categories='news')
cut = int(len(tagged_sents) * 0.9)
print(cut)
train_sents, test_sents = tagged_sents[:cut], tagged_sents[cut:]
tagger = nltk.UnigramTagger(train_sents)
print(tagger.evaluate(test_sents))