Hands-On Natural Language Processing with Python 1: NLTK

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/QFire/article/details/84862942

基本用法:

>>> from nltk.tokenize import word_tokenize as wtoken
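>>> wtoken(samples_tw[20])  # NOTE: samples_tw is not defined in this snippet; it must be a previously loaded list of text strings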
>>> from nltk.stem import PorterStemmer
>>> stemming = PorterStemmer()
>>> stemming.stem('enjoying')
'enjoy'
>>> stemming.stem('enjoys')
'enjoy'
>>> stemming.stem('enjoyable')
'enjoy'
>>> from nltk.corpus import stopwords
>>> sw_l = stopwords.words('english')
>>> sw_l[20:40]
['himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this']
>>> example_text = "This is an example sentence to test stopwords"
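>>> # The membership test is case-sensitive and the stopword list is lowercase, so 'This' is kept; compare word.lower() to drop it too.
>>> example_text_without_stopwords = [word for word in example_text.split() if word not in sw_l]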
>>> example_text_without_stopwords
['This', 'example', 'sentence', 'test', 'stopwords']
>>> from nltk.corpus import webtext
>>> webtext_sentences = webtext.sents('firefox.txt')
>>> webtext_words = webtext.words('firefox.txt')
>>> len(webtext_sentences)
1142
>>> len(webtext_words)
102457
>>> vocabulary = set(webtext_words)
>>> len(vocabulary)
8296
>>> import nltk
>>> frequency_dist = nltk.FreqDist(webtext_words)
>>> sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)[0:30]
['.', 'in', 'to', '"', 'the', "'", 'not', '-', 'when', 'on', 'a', 'is', 't', 'and', 'of', '(', 'page', 'for', 'with', ')', 'window', 'Firefox', 'does', 'from', 'open', ':', 'menu', 'should', 'bar', 'tab']
>>> large_words = dict([(k,v) for k,v in frequency_dist.items() if len(k)>3])
>>> frequency_dist = nltk.FreqDist(large_words)
>>> frequency_dist.plot(50, cumulative=False)
>>> from wordcloud import WordCloud
>>> wcloud = WordCloud().generate_from_frequencies(frequency_dist)
>>> import matplotlib.pyplot as plt
>>> plt.imshow(wcloud, interpolation='bilinear')
<matplotlib.image.AxesImage object at 0x000000000DED65F8>
>>> plt.axis('off')
(-0.5, 399.5, 199.5, -0.5)
>>> plt.show()

猜你喜欢

转载自blog.csdn.net/QFire/article/details/84862942