Jieba word segmentation, text clustering, sentiment analysis, and word-cloud visualization

import re
import numpy
import jieba
import jieba.analyse                        # keyword extraction
import jieba.posseg as pseg                 # part-of-speech tagging
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from snownlp import SnowNLP
from imageio import imread                  # scipy.misc.imread was removed in SciPy 1.2+; imageio provides a drop-in imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def load_file():
    '''
        Load the user dictionary, strip punctuation with a regex, and run the full pipeline on the clean text.
    '''
    jieba.load_userdict("G:/anaconda/dict_lzf.txt")       # load an external user-defined dictionary
    client = MongoClient('localhost', 27017)  # connect to MongoDB
    db = client['Taoguba']                    # use the Taoguba database
    news = db.Taoguba.find()
    # db = client['Eastmoney']                # or use the Eastmoney database instead
    # news = db.Eastmoney.find()
    all_new = []
    N_new = []
    summaries = []
    for i in news:
        new = i["Content"]
        # new = i["massage"]
        r = '[’@#~%……&*() ——+|}{“?》《,。、、】【!"#$%&\'()*+,-./:; <=>?@[\\]^_`{|}~]+'
        news1 = re.sub(r, '', new)
        news1 = re.sub('[a-zA-Z0-9]', '', news1)
        stop_new = stop_dict(news1)
        cut = cut_package(stop_new)
        N_new.append(cut)
        all_new.append(new)
        s = summary(new)
        summaries.append(str(s))
    emo = emotion(summaries)
    print("Sentiment scores:")
    print(emo)
    print("Sentiment classification results:")
    sum_number(emo)
    word_matrix = word_array(N_new)
    cos_sim = cosine_similarities(word_matrix)
    k_data = K_means(cos_sim)
    print("Printing the clustered documents:")
    data_arr = numpy.array(all_new)
    for i in range(5):
        print("--------------- Cluster %d ---------------" % (i + 1))
        data = data_arr[k_data == i]
        ia = []
        for item in data:
            print(item)
            ia.append(item)
            # write_to_DB(item)
    w_cloud(' '.join(N_new))                  # join the segmented documents into one string for the word cloud

def write_to_DB(content):
    client = MongoClient('localhost', 27017)      # connect to MongoDB
    db = client['Taoguba1']                        # use the Taoguba1 database
    db.Taoguba1.insert_one({"massage": content})   # insert the document (insert() was removed in PyMongo 4)

def stop_dict(news):
    '''
        Remove all stop words (character-level filtering before segmentation)
    '''
    with open("G:/anaconda/stopwords.txt", 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)   # one stop word per line; reading the file as a single string would drop every character that appears anywhere in it
    outstr = ''
    for word in news:
        if word not in stopwords:
            outstr += word
    return outstr

def cut_package(news):
    '''
       Segment the text with the chosen jieba mode
    '''
    seg_list = jieba.cut(news, cut_all=False)         # accurate mode (the default)
    seg = ' '.join(seg_list)
    return seg

    # seg_list = jieba.cut(news, cut_all=True)         # full mode
    # print("Full Mode:", ' '.join(seg_list))

    # seg_list = jieba.cut_for_search(news)            # search-engine mode
    # print("Search Mode:", ' '.join(seg_list))

def word_seg(word):
    '''
        Part-of-speech tagging
    '''
    seg_list = pseg.cut(word)
    for w in seg_list:
        # return w.word, w.flag
        print(w.word, w.flag)
        # return w

def find_word(item):
    '''
        Keyword extraction
    '''
    c = jieba.analyse.extract_tags(item, topK=10, withWeight=True, allowPOS=('n',))    # return the 10 highest-weighted nouns
    for i in c:
        print(i)

def tf_idf(corpus):
    '''
        Term frequency / inverse document frequency
    '''
    vectorizer = CountVectorizer()          # turn the documents into a term-frequency matrix
    transformer = TfidfTransformer()        # compute the tf-idf weight of every term
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
                                            # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names_out()   # all terms of the bag-of-words model (get_feature_names() was removed in scikit-learn 1.2)
    weight = tfidf.toarray()                # weight[i][j] is the tf-idf weight of term j in document i
    # write_to_DB(str(weight))
    print(weight)
    # K_means(weight)
    for i in range(len(weight)):            # outer loop walks the documents, inner loop the term weights of one document
        print(u"------- tf-idf term weights of this document -------")
        for j in range(len(word)):
            print(word[j], weight[i][j])
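
# A minimal, uncalled sketch of tf_idf() on a hypothetical three-document corpus;
# the documents are already segmented and space-separated, which is what the
# pipeline above feeds into CountVectorizer.
def _tf_idf_demo():
    corpus = ["股票 上涨 明显", "股票 下跌 风险 明显", "行情 持续 上涨"]
    tf_idf(corpus)      # prints a tf-idf weight for every term in every document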

def word_array(corpus):
    '''
        Term-frequency matrix
    '''
    vectorizer = CountVectorizer()  # turn the documents into a term-frequency matrix
    x = vectorizer.fit_transform(corpus)
    return x

def cosine_similarities(x):
    '''
        Pairwise cosine similarity
    '''
    sim = cosine_similarity(x, x)
    return sim
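
# A minimal, uncalled sketch: cosine_similarities() returns an n_docs x n_docs
# matrix with 1.0 on the diagonal (each document compared with itself). The toy
# corpus is made up and already space-separated.
def _cosine_demo():
    x = word_array(["股票 上涨", "股票 下跌", "上涨 行情"])
    sim = cosine_similarities(x)
    print(sim.shape)    # (3, 3)
    print(sim)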

def K_means(weight):
    '''
        Document clustering
    '''
    clf = KMeans(n_clusters=5, init='k-means++', random_state=123)
    k_data = clf.fit_predict(weight)
    # print(k_data)
    return k_data
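
# A minimal, uncalled sketch of K_means() on hypothetical random data: ten
# 3-dimensional points are assigned to the five clusters and their labels printed.
def _kmeans_demo():
    points = numpy.random.rand(10, 3)
    labels = K_means(points)
    print(labels)       # ten cluster labels in the range 0..4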

def MiniBatch_KMeans(weight):
    '''
        Mini-batch K-means clustering for larger document sets
    '''
    clf = MiniBatchKMeans(n_clusters=1)
    s = clf.fit(weight)
    print(s)

def emotion(text):
    '''
        SnowNLP sentiment score for every document (0 = negative, 1 = positive)
    '''
    mood_ = []
    for i in text:
        mood = SnowNLP(i)
        sim_mood = mood.sentiments
        mood_.append(sim_mood)
    return mood_

def summary(txt):
    '''
        Extract up to five key sentences with SnowNLP
    '''
    s = SnowNLP(txt)
    Summary = s.summary(limit=5)
    # print(Summary)
    return Summary
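
# A minimal, uncalled sketch of the two SnowNLP helpers above on made-up text:
# sentiments close to 1 read as positive, close to 0 as negative; summary()
# keeps at most five key sentences.
def _snownlp_demo():
    print(emotion(["这只股票走势非常好", "亏损严重,风险很大"]))
    print(summary("今天大盘放量上涨。成交量明显放大。市场情绪回暖。短线仍需注意风险。"))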

def sum_number(scores):
    '''
        Map each sentiment score to 1 (positive, score > 0.6) or 0 (negative) and count them
    '''
    number = []
    for i in scores:
        if i > 0.6:
            number.append(1)
        else:
            number.append(0)
    print(number)
    numb = set(number)
    print("Sentiment counts:")
    for i in numb:
        a = number.count(i)
        print("  %d  appears %d times!" % (i, a))
    if number.count(0) <= number.count(1):
        print("The documents lean positive!")
    else:
        print("The documents lean negative!")

def w_cloud(seg):
    '''
        Draw a word cloud of the segmented text
    '''
    back_coloring = imread("G:/Anaconda/LZF.png")      # mask image that shapes the cloud
    # ima_graph = np.array(ima)
    wc = WordCloud(font_path="G:/Anaconda/font/KT_GB2312.ttf",
                   background_color="black", width=2300, height=1000, max_font_size=1000,
                   mask=back_coloring)
    wc.generate(seg)
    # imacolor = ImageColorGenerator(ima_graph)
    # wc.recolor(color_func=imacolor)
    # wc.to_file(r"G:/Anaconda/work/downloads/picture/world_cloud/LZFDF.jpg")
    plt.figure("Word Cloud")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
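
# A minimal, uncalled sketch of WordCloud without a mask image; the font path is
# reused from w_cloud() above and must point to a font that contains the Chinese
# glyphs, otherwise the words render as boxes.
def _wordcloud_demo():
    text = "股票 上涨 行情 股票 风险 上涨"
    cloud = WordCloud(font_path="G:/Anaconda/font/KT_GB2312.ttf",
                      background_color="white").generate(text)
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()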

def main():
    load_file()

if __name__ == '__main__':
    main()

Reposted from blog.csdn.net/luzaofa/article/details/79712638