python应用:主题分类(gensim lda)

安装第三方包:gensim(用于LDA主题模型)和 jieba(用于中文分词),例如:pip install gensim jieba

首先,对语料进行分词并去除停用词(即与主题无关的词)

 1 #-*-coding:utf8-*-
 2 
 3 import jieba
 4 
 5 def stopwordslist(filepath):
 6     stopwords = [line.strip() for line in open(filepath, 'r').readlines()]
 7     return stopwords
 8 
 9 def seg_sentence(sentence):
10     sentence_seged = jieba.cut(sentence.strip())
11     stopwords = stopwordslist('stopWords/stopwords.txt')
12     outstr = ''
13     for word in sentence_seged:
14         word = word.lower()
15         if word not in stopwords:
16             if word != '\t':
17                 outstr += word
18                 outstr += " "
19     return outstr
20 
# Segment the raw corpus line by line and write the stop-word-filtered
# result, one document per line, for the LDA step to consume.
# Both files are managed by `with`, so they are closed (and the output
# flushed) even if segmentation raises part-way through.
with open('input/copurs.txt', 'r') as inputs, \
        open('input/copurs_out.txt', 'w') as outputs:
    for line in inputs:
        outputs.write(seg_sentence(line) + '\n')

然后,执行主题分类操作

 1 import codecs
 2 from gensim import corpora
 3 from gensim.models import LdaModel
 4 from gensim import models
 5 from gensim.corpora import Dictionary
 6 
 7 
 8 te = []
 9 fp = codecs.open('input/copurs_out.txt','r')
10 for line in fp:
11     line = line.split()
12     te.append([ w for w in line  ])
13 print len(te) 
14 dictionary = corpora.Dictionary(te)
15 corpus = [ dictionary.doc2bow(text) for text in te ]
16 
17 #tfidf = models.TfidfModel(corpus)
18 #corpus_tfidf = tfidf[corpus]
19 
20 #########Run the LDA model for XX topics ###############################
21 lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50,passes=2000) 
22 doc_topic = [a for a in lda[corpus]]
23 
24 ####### write the topics in file topics_result.txt ####################
25 topics_r = lda.print_topics(num_topics = 50, num_words = 10)
26 topic_name = codecs.open('output/topics_result.txt','w')
27 for v in topics_r:
28     topic_name.write(str(v)+'\n')
29 
30     
31 ######################  write the class results to file  #########################
32 ###################### each document belongs to which topic ######################
33 
34 fp2 = codecs.open('output/documents_result.txt','w')
35 for t in doc_topic:
36     c = []
37     c.append([a[1] for a in t])
38     m = max(c[0])
39     
40     for i in range(0, len(t)):
41         if m in t[i]:
42             #print(t[i])
43             fp2.write(str(t[i][0]) + '  ' + str(t[i][1]) + '\n')
44             break
45 ################################ OVER ############################################

注意:上述主题分类,仅使用lda模型(根据频数计算)

也可以结合 tf-idf 模型使用:先取消上面代码中 TfidfModel 两行的注释以得到 corpus_tfidf,再把 "Run the LDA model for XX topics" 注释下面的两行代码改为如下两种方式之一即可:

方式一
######### Run the LDA model for XX topics ###############################
# Variant 1: train LDA on the tf-idf weighted corpus and infer the document
# topics on that same weighted corpus.
# Requires corpus_tfidf (tfidf = models.TfidfModel(corpus); corpus_tfidf = tfidf[corpus]).
lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50, passes=2000)
doc_topic = [a for a in lda[corpus_tfidf]]

方式二
######### Run the LDA model for XX topics ###############################
# Variant 2: train LDA on the raw bag-of-words corpus, but infer the document
# topics on the tf-idf weighted corpus.
# Requires corpus_tfidf (tfidf = models.TfidfModel(corpus); corpus_tfidf = tfidf[corpus]).
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=2000)
doc_topic = [a for a in lda[corpus_tfidf]]

常用的是方式一。作者暂时尚未弄清楚这两种方式的区别,后期将会继续完善。

 




猜你喜欢

转载自www.cnblogs.com/jpapplication/p/9135713.html