Word Segmentation and Word Cloud Design

1. Basic usage of jieba

import jieba


s1 = '我喜欢广州小蛮腰'
s2 = "我喜欢上海东方明珠"
# jieba.cut() defaults to precise mode
print(10*'-', 'Full mode', 10*'-')
r1 = jieba.cut(s1, cut_all=True)  # full mode
print(s1)
for ele in r1:
    print(ele)

print(10*'-', 'Precise mode', 10*'-')
r2 = jieba.cut(s1, cut_all=False)
print(s1)
for ele in r2:
    print(ele)

print(10*'-', 'Search engine mode', 10*'-')
r3 = jieba.cut_for_search(s1)  # search engine mode
for ele in r3:
    print(ele)

# Part-of-speech tagging
import jieba.posseg
print('---POS tagging---')
r4 = jieba.posseg.cut(s1)
# word: the token; flag: its part of speech
for ele in r4:
    print(ele.word + ' ' + ele.flag)

# Load a user dictionary (here, jieba's bundled dict.txt as an example)
jieba.load_userdict('/home/chen/anaconda3/lib/python3.6/site-packages/jieba/dict.txt')

# Adjust word frequency so a phrase is kept together
print('---Adjusting word frequency---')
word_chg = '上海东方'
jieba.add_word(word_chg)
jieba.suggest_freq(word_chg, True)

# Extract keywords
import jieba.analyse
tag = jieba.analyse.extract_tags(s2, 3)  # top 3 keywords
print(tag)

# Return the position of each token
print('---Token positions---')
word_index = jieba.tokenize(s2)
for ele in word_index:
    print(ele)

print('---Token positions, search-engine mode---')
word_index = jieba.tokenize(s2, mode='search')
for ele in word_index:
    print(ele)

'''
POS tag reference:
a   adjective
c   conjunction
d   adverb
e   interjection
f   locative word
i   idiom
m   numeral/quantifier
n   noun
nr  person name
ns  place name
nt  organization
nz  other proper noun
p   preposition
r   pronoun
t   time word
u   particle
v   verb
vn  verbal noun
w   punctuation
un  unknown word
'''
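To see the effect of the mode switches at a glance, the generators can be joined into one line, and extract_tags can also return TF-IDF weights. A minimal sketch, assuming the same s1 and s2 strings as above:

# A minimal sketch, reusing the s1/s2 strings from the script above.
import jieba
import jieba.analyse

s1 = '我喜欢广州小蛮腰'
s2 = "我喜欢上海东方明珠"

print('/'.join(jieba.cut(s1)))                # precise mode, one sentence per line
print('/'.join(jieba.cut(s1, cut_all=True)))  # full mode

# withWeight=True returns (keyword, TF-IDF weight) pairs instead of bare strings
for word, weight in jieba.analyse.extract_tags(s2, topK=3, withWeight=True):
    print(word, weight)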

2. A text-mining example

import jieba.analyse
import os

print(os.path.dirname(os.getcwd()))
file_name = '/home/chen/projects/词云/resource/斗罗大陆.txt'
# file_name is already absolute, so the join returns it unchanged
path = os.path.join(os.path.dirname(os.getcwd()), file_name)

with open(file_name) as fp:
    data = fp.read()
# extract_tags returns 20 keywords by default; ask for 50 here
tag = jieba.analyse.extract_tags(data, 50)
print(tag)
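If the keyword list comes back dominated by function words, extract_tags can be restricted to selected parts of speech via its allowPOS parameter. A minimal sketch, assuming the same data variable as above:

# A minimal sketch, reusing the data variable from the script above.
# allowPOS keeps only tokens whose POS tag is listed, e.g. nouns,
# person names, and place names (see the tag reference in section 1).
import jieba.analyse

tag = jieba.analyse.extract_tags(data, topK=50, allowPOS=('n', 'nr', 'ns'))
print(tag)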

3. Word cloud design

# data collection --> segmentation --> filter noise words --> select high-frequency words --> generate image

from wordcloud import WordCloud
import os
import jieba  # Chinese word segmentation
import numpy as np
import PIL.Image as img  # image handling
import random


# noise words to filter out ("then", "these", "those", "if")
stopwords = {"然后", "这些", "那些", "如果"}
cur_path = os.path.dirname(os.path.dirname(__file__)) + "/resource"

def chinese_jieba(text):
    # segment the text and rejoin with spaces, the form WordCloud expects
    wordlist_jieba = jieba.cut(text)
    text_jieba = " ".join(wordlist_jieba)
    return text_jieba

with open(os.path.join(cur_path, 'test.txt')) as fp:
    text = fp.read()
    text = chinese_jieba(text)
    # pick a background image at random
    r_num = random.randrange(0, 5)
    # r_num = 4  # uncomment to pin a specific background image
    if r_num == 0:
        bg_img = 'zh.png'
    elif r_num == 1:
        bg_img = 'heart.png'
    elif r_num == 2:
        bg_img = 'broken_heart.png'
    elif r_num == 3:
        bg_img = 'tree.png'
    else:
        bg_img = 'love_love.png'
    # load the background image as a mask
    mask_pic = np.array(img.open(os.path.join(cur_path, bg_img)))
    # the defaults yield a rectangular cloud on a black background:
    # wcd = WordCloud().generate(text)
    # set background color, max word count, max font size, font, stopwords, and mask
    wcd = WordCloud(background_color='white',
                    # max_words=100,
                    max_font_size=40,
                    font_path='JDFZONGYI.ttf',
                    stopwords=stopwords,
                    mask=mask_pic).generate(text)
    image = wcd.to_image()
    image.show()
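image.show() only previews the result. A minimal sketch of saving it to disk and recoloring the words to match the mask image, assuming the wcd and mask_pic variables from the script above and an RGB mask:

# A minimal sketch, reusing wcd and mask_pic from the script above.
from wordcloud import ImageColorGenerator

wcd.to_file('wordcloud.png')  # save the cloud as a PNG
# tint each word with the color of the mask region it sits on
# (ImageColorGenerator expects an RGB/RGBA image, not grayscale)
wcd.recolor(color_func=ImageColorGenerator(mask_pic))
wcd.to_file('wordcloud_colored.png')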


Reposted from www.cnblogs.com/ray-mmss/p/9377090.html