"""Chinese text-mining pipeline.

Loads forum posts from a local MongoDB, cleans and segments them with
jieba, then runs SnowNLP sentiment analysis, TF-IDF / cosine-similarity
based K-means clustering, and finally renders a word cloud.

Fixes applied to the original:
  * ``scipy.misc.imread`` (removed from SciPy) replaced by ``plt.imread``.
  * ``Collection.insert`` (removed in pymongo 4) replaced by ``insert_one``.
  * ``get_feature_names`` (removed in scikit-learn 1.2) replaced by
    ``get_feature_names_out``.
  * no longer shadows the ``sum`` builtin; dead code removed.
"""

import re

import numpy
import jieba
import jieba.analyse            # keyword extraction
import jieba.posseg as pseg     # part-of-speech tagging
import matplotlib.pyplot as plt
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from snownlp import SnowNLP
from wordcloud import WordCloud


def load_file():
    """Run the whole pipeline over every document in the Taoguba collection.

    Steps: load the user dictionary, fetch documents from MongoDB, strip
    punctuation / ASCII alphanumerics / stop words, segment with jieba,
    print sentiment statistics, cluster the corpus into 5 groups, print
    each cluster, and draw a word cloud of the segmented text.
    """
    jieba.load_userdict("G:/anaconda/dict_lzf.txt")  # external user dictionary
    client = MongoClient('localhost', 27017)
    db = client['Taoguba']
    news = db.Taoguba.find()
    # db = client['Eastmoney']
    # news = db.Eastmoney.find()

    all_new = []    # raw document texts, in fetch order
    n_new = []      # cleaned documents as space-joined jieba tokens
    summaries = []  # str() of each document's extractive summary

    # Character class covering full- and half-width punctuation.
    punct = '[’!@#~¥%……&*() ——+|}{“:”?》《,。、‘;’、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
    for doc in news:
        raw = doc["Content"]
        # raw = doc["massage"]          # field name used by the Eastmoney collection
        cleaned = re.sub(punct, '', raw)
        cleaned = re.sub('[a-zA-Z0-9]', '', cleaned)  # drop ASCII letters/digits
        cleaned = stop_dict(cleaned)
        n_new.append(cut_package(cleaned))
        all_new.append(raw)
        summaries.append(str(summary(raw)))

    emo = emotion(summaries)
    print("情感相似度如下:")
    print(emo)
    print("情感分类结果如下:")
    sum_number(emo)

    word_matrix = world_arry(n_new)
    sims = cosine_similarities(word_matrix)
    k_data = K_means(sims)

    print("正在打印分类数据:")
    data_arry = numpy.array(all_new)  # hoisted: invariant across clusters
    for cluster in range(5):
        print("---------------正在打印第%d类---------------" % (cluster + 1))
        for item in data_arry[k_data == cluster]:
            print(item)
            # write_to_DB(item)

    w_cloud(str(n_new))


def write_to_DB(content):
    """Persist one document into the local 'Taoguba1' collection.

    Uses ``insert_one`` — ``Collection.insert`` was removed in pymongo 4.
    """
    client = MongoClient('localhost', 27017)
    db = client['Taoguba1']
    db.Taoguba1.insert_one({"massage": content})


def stop_dict(news):
    """Return *news* with every stop character removed.

    NOTE(review): this matches the original behaviour, which tested each
    *character* against the raw stop-word file contents — i.e. the file is
    treated as a bag of characters, not a list of words.
    """
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as fh:
        stop_chars = set(fh.read())  # set membership is O(1) per character
    return ''.join(ch for ch in news if ch not in stop_chars)


def cut_package(news):
    """Segment *news* with jieba in precise mode; return space-joined tokens."""
    # Alternatives: jieba.cut(news, cut_all=True)  -> full mode
    #               jieba.cut_for_search(news)     -> search-engine mode
    return ' '.join(jieba.cut(news, cut_all=False))


def world_seg(world):
    """Print each token of *world* together with its part-of-speech flag."""
    for w in pseg.cut(world):
        print(w.word, w.flag)


def find_world(item):
    """Print the 10 highest-weighted noun keywords of *item* (word, weight)."""
    # ('n',) is equivalent to the original ('n') — jieba builds a frozenset
    # from allowPOS, and frozenset('n') == frozenset(('n',)).
    keywords = jieba.analyse.extract_tags(item, topK=10, withWeight=True,
                                          allowPOS=('n',))
    for kw in keywords:
        print(kw)


def tf_idf(corpus):
    """Print the tf-idf weight of every term for every document in *corpus*."""
    vectorizer = CountVectorizer()     # text -> term-frequency matrix
    transformer = TfidfTransformer()   # term frequencies -> tf-idf weights
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # get_feature_names() was removed in scikit-learn 1.2.
    word = vectorizer.get_feature_names_out()
    weight = tfidf.toarray()  # weight[i][j]: tf-idf of term j in document i
    # write_to_DB(str(weight))
    print(weight)
    # K_means(weight)
    for i in range(len(weight)):
        print(u"-------文本的词语tf-idf权重------")
        for j in range(len(word)):
            print(word[j], weight[i][j])


def world_arry(corpus):
    """Return the sparse term-frequency matrix of *corpus*."""
    return CountVectorizer().fit_transform(corpus)


def cosine_similarities(x):
    """Return the pairwise cosine-similarity matrix of the rows of *x*."""
    return cosine_similarity(x, x)


def K_means(weight):
    """Cluster the rows of *weight* into 5 groups; return the label array."""
    clf = KMeans(n_clusters=5, init='k-means++', random_state=123)
    return clf.fit_predict(weight)


def MiniBatch_KMeans(weight):
    """Fit a single-cluster MiniBatchKMeans on *weight* and print the model."""
    clf = MiniBatchKMeans(n_clusters=1)
    print(clf.fit(weight))


def emotion(text):
    """Return SnowNLP sentiment scores, one per item (0..1, higher = positive)."""
    return [SnowNLP(item).sentiments for item in text]


def summary(txt):
    """Return up to 5 extractive summary sentences of *txt*."""
    return SnowNLP(txt).summary(limit=5)


def sum_number(summarry):
    """Binarise sentiment scores at 0.6 and print a positive/negative verdict."""
    number = [1 if score > 0.6 else 0 for score in summarry]
    print(number)
    print("情感统计结果如下:")
    for label in set(number):
        count = number.count(label)
        print(" %i 一共出现了%a次!" % (label, count))
    if number.count(0) <= number.count(1):
        print("文档偏积极型!")
    else:
        print("文档偏消极型!")


def w_cloud(seg):
    """Render *seg* as a word cloud masked by a background image and show it."""
    # scipy.misc.imread was removed from SciPy; matplotlib's imread replaces it.
    back_coloring = plt.imread("G:/Anaconda/LZF.png")
    if back_coloring.dtype != numpy.uint8:
        # plt.imread yields floats in [0, 1] for PNG; WordCloud masks expect
        # 0-255 integers, which scipy.misc.imread used to produce.
        back_coloring = (back_coloring * 255).astype(numpy.uint8)
    wc = WordCloud(font_path="G:/Anaconda/font/KT_GB2312.ttf",
                   background_color="black", width=2300, height=1000,
                   max_font_size=1000, mask=back_coloring)
    wc.generate(seg)
    # wc.to_file(r"G:/Anaconda/work/downloads/picture/world_cloud/LZFDF.jpg")
    plt.figure("词云图")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


def main():
    load_file()


if __name__ == '__main__':
    main()
# 结巴分词,文本聚类,情感分析,词云图可视化
# (jieba segmentation, text clustering, sentiment analysis, word-cloud visualisation)
# Adapted from: blog.csdn.net/luzaofa/article/details/79712638