Email Topic Classification Based on an LDA Model

Data source: roughly 7,000 Hillary Clinton emails, Emails.csv

Environment: Windows 10 (64-bit) + Python 3.6 + PyCharm

Python source code:

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import pandas as pd
import re
from gensim import corpora, models, similarities
import gensim


# Read the emails
def read_emails(file_path):
    df = pd.read_csv(file_path)
    # df is a pandas.core.frame.DataFrame
    # keep only the ['Id', 'ExtractedBodyText'] columns and drop rows with missing bodies
    df = df[['Id', 'ExtractedBodyText']].dropna()
    return df


# Text preprocessing
def clean_email_text(text):
    text = text.replace("\n", " ")  # replace newlines with spaces
    text = re.sub(r'-', ' ', text)  # muyao-studio -> muyao studio
    text = re.sub(r'\d+/\d+/\d+', '', text)  # remove dates
    text = re.sub(r'[0-2]?[0-9]:[0-6][0-9]', '', text)  # remove times
    text = re.sub(r'[\w]+@[\.\w]+', '', text)  # remove email addresses
    text = re.sub(r'((https|http|ftp)?(:\/\/)?)www\.(([A-Za-z0-9-~]+)\.)+([A-Za-z0-9-~\/])+', '', text)  # remove URLs
    pure_text = ''

    # Filter out special characters
    for letter in text:
        # keep only letters and spaces
        if letter.isalpha() or letter == ' ':
            pure_text += letter

    # Drop single-letter tokens, e.g. don't -> don t, then discard the stray t
    text = ' '.join(word for word in pure_text.split() if len(word) > 1)
    return text


# Read the cleaned email bodies into doc_list
def df2list(df):
    docs = df['ExtractedBodyText']
    docs = docs.apply(lambda s: clean_email_text(s))
    # array of cleaned email strings: [email_1, email_2, ..., email_n]
    doc_list = docs.values
    return doc_list


# Build the LDA model
def LDA(doc_list):
    # 1. Load the stopword list
    stopwords = []
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stopwords.append(line.strip())
    # print(stopwords)

    # 2. Tokenize the English text and drop stopwords, giving a standard gensim corpus
    texts = [[word for word in doc.lower().split() if word not in stopwords] for doc in doc_list]
    # print(len(texts))
    # print(texts[0])

    # 3. Build the dictionary and the bag-of-words corpus
    dictionary = corpora.Dictionary(texts)  # id -> word
    corpus = [dictionary.doc2bow(text) for text in texts]  # e.g. (213, 2) means word id 213 occurs twice
    # print(corpus[13])

    # 4. Build the LDA model (the class to use is gensim.models.ldamodel.LdaModel); the number of topics is set manually to 20
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    for i in range(20):
        print('topic = ' + str(i) + '  ' + lda.print_topic(i, topn=5))  # the topn highest-weight words of topic i
        print('----------------------------------------------')
    return lda


def main():
    file_path = 'emails.csv'
    df = read_emails(file_path)  # read the emails
    doc_list = df2list(df)  # build the list of cleaned email bodies
    lda = LDA(doc_list)  # topic modeling

    # lda.get_document_topics(bow)  # takes a bag-of-words document
    for j in range(100):
        topic_id = lda.get_term_topics(word_id=j)  # topics relevant to word id j, as (topic_id, probability) pairs
        print(topic_id)


if __name__ == '__main__':
    main()
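
The commented-out lda.get_document_topics(bow) call in main() is what you would use to classify a single email. Below is a minimal sketch of that workflow; it assumes the dictionary and stopword list built inside LDA() are made available to the caller (for example by returning them alongside the model), which the code above does not currently do.

# Sketch: classify one new email with the trained model.
# Assumes `lda`, `dictionary` and `stopwords` are in scope (a hypothetical
# refactor of LDA() that also returns the dictionary and stopword list).
def classify_email(raw_text, lda, dictionary, stopwords):
    cleaned = clean_email_text(raw_text)
    tokens = [w for w in cleaned.lower().split() if w not in stopwords]
    bow = dictionary.doc2bow(tokens)        # bag-of-words vector for this email
    topics = lda.get_document_topics(bow)   # list of (topic_id, probability) pairs
    return max(topics, key=lambda t: t[1])  # most probable topic and its probability

# Example usage (hypothetical email text):
# topic_id, prob = classify_email('Please call the office about the meeting', lda, dictionary, stopwords)
# print(topic_id, prob)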

Output:

topic = 0  0.007*"people" + 0.004*"haitian" + 0.004*"support" + 0.004*"government" + 0.004*"women"
----------------------------------------------
topic = 1  0.009*"good" + 0.006*"sounds" + 0.004*"catholic" + 0.004*"rains" + 0.004*"ran"
----------------------------------------------
topic = 2  0.061*"fyi" + 0.016*"labour" + 0.013*"party" + 0.009*"election" + 0.008*"afghan"
----------------------------------------------
topic = 3  0.009*"cheryl" + 0.008*"pm" + 0.008*"fw" + 0.008*"mills" + 0.006*"saturday"
----------------------------------------------
topic = 4  0.011*"percent" + 0.007*"great" + 0.007*"health" + 0.006*"im" + 0.005*"care"
----------------------------------------------
topic = 5  0.085*"pm" + 0.040*"office" + 0.035*"secretarys" + 0.026*"meeting" + 0.024*"room"
----------------------------------------------
topic = 6  0.006*"print" + 0.006*"book" + 0.005*"pls" + 0.005*"qddr" + 0.004*"diplomacy"
----------------------------------------------
topic = 7  0.008*"obama" + 0.007*"american" + 0.007*"israel" + 0.005*"policy" + 0.005*"israeli"
----------------------------------------------
topic = 8  0.014*"bloomberg" + 0.008*"mod" + 0.005*"vote" + 0.004*"party" + 0.003*"good"
----------------------------------------------
topic = 9  0.009*"president" + 0.007*"mr" + 0.006*"obama" + 0.005*"mcchrystal" + 0.005*"beck"
----------------------------------------------
topic = 10  0.006*"holiday" + 0.005*"note" + 0.004*"iii" + 0.004*"joanne" + 0.004*"roger"
----------------------------------------------
topic = 11  0.013*"party" + 0.006*"tea" + 0.005*"people" + 0.004*"talks" + 0.004*"good"
----------------------------------------------
topic = 12  0.009*"talk" + 0.007*"told" + 0.005*"copies" + 0.005*"argentina" + 0.005*"chicago"
----------------------------------------------
topic = 13  0.022*"clips" + 0.019*"thx" + 0.012*"state" + 0.011*"lona" + 0.011*"doc"
----------------------------------------------
topic = 14  0.005*"palestinians" + 0.004*"government" + 0.003*"claims" + 0.003*"political" + 0.003*"brother"
----------------------------------------------
topic = 15  0.038*"call" + 0.026*"pm" + 0.010*"pis" + 0.010*"pls" + 0.010*"email"
----------------------------------------------
topic = 16  0.009*"tomorrow" + 0.009*"back" + 0.009*"work" + 0.008*"good" + 0.006*"id"
----------------------------------------------
topic = 17  0.007*"logistics" + 0.006*"waldorf" + 0.006*"corker" + 0.005*"water" + 0.005*"china"
----------------------------------------------
topic = 18  0.010*"security" + 0.009*"state" + 0.007*"strategic" + 0.006*"press" + 0.006*"united"
----------------------------------------------
topic = 19  0.015*"state" + 0.006*"department" + 0.005*"case" + 0.005*"marie" + 0.005*"pm"
----------------------------------------------
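
Each line above shows a topic id followed by the five most heavily weighted words in that topic. To turn the model into an actual per-email classification, one straightforward option is to assign every email in the corpus to its dominant topic and tally the counts. A rough sketch, assuming the bag-of-words corpus built inside LDA() is also returned to the caller:

from collections import Counter

# Sketch: count how many emails fall under each topic.
# Assumes `lda` and `corpus` (the list of doc2bow vectors built in LDA()) are in scope.
topic_counts = Counter()
for bow in corpus:
    doc_topics = lda.get_document_topics(bow)          # (topic_id, probability) pairs
    dominant = max(doc_topics, key=lambda t: t[1])[0]  # topic with the highest probability
    topic_counts[dominant] += 1

for topic_id, count in topic_counts.most_common():
    print('topic', topic_id, ':', count, 'emails')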

Stopwords (stopwords.txt):

able 
about 
above 
according 
accordingly 
across 
actually 
after 
afterwards 
again 
against 
ain’t 
all 
allow 
allows 
almost 
alone 
along 
already 
also 
although 
always 
am 
among 
amongst 
an 
and 
another 
any 
anybody 
anyhow 
anyone 
anything 
anyway 
anyways 
anywhere 
apart 
appear 
appreciate 
appropriate 
are 
aren’t 
around 
as 
a’s 
aside 
ask 
asking 
associated 
at 
available 
away 
awfully 
be 
became 
because 
become 
becomes 
becoming 
been 
before 
beforehand 
behind 
being 
believe 
below 
beside 
besides 
best 
better 
between 
beyond 
both 
brief 
but 
by 
came 
can 
cannot 
cant 
can’t 
cause 
causes 
certain 
certainly 
changes 
clearly 
c’mon 
co 
com 
come 
comes 
concerning 
consequently 
consider 
considering 
contain 
containing 
contains 
corresponding 
could 
couldn’t 
course 
c’s 
currently 
definitely 
described 
despite 
did 
didn’t 
different 
do 
does 
doesn’t 
doing 
done 
don’t 
down 
downwards 
during 
each 
edu 
eg 
eight 
either 
else 
elsewhere 
enough 
entirely 
especially 
et 
etc 
even 
ever 
every 
everybody 
everyone 
everything 
everywhere 
ex 
exactly 
example 
except 
far 
few 
fifth 
first 
five 
followed 
following 
follows 
for 
former 
formerly 
forth 
four 
from 
further 
furthermore 
get 
gets 
getting 
given 
gives 
go 
goes 
going 
gone 
got 
gotten 
greetings 
had 
hadn’t 
happens 
hardly 
has 
hasn’t 
have 
haven’t 
having 
he 
hello 
help 
hence 
her 
here 
hereafter 
hereby 
herein 
here’s 
hereupon 
hers 
herself 
he’s 
hi 
him 
himself 
his 
hither 
hopefully 
how 
howbeit 
however 
i’d 
ie 
if 
ignored 
i’ll 
i’m 
immediate 
in 
inasmuch 
inc 
indeed 
indicate 
indicated 
indicates 
inner 
insofar 
instead 
into 
inward 
is 
isn’t 
it 
it’d 
it’ll 
its 
it’s 
itself 
i’ve 
just 
keep 
keeps 
kept 
know 
known 
knows 
last 
lately 
later 
latter 
latterly 
least 
less 
lest 
let 
let’s 
like 
liked 
likely 
little 
look 
looking 
looks 
ltd 
mainly 
many 
may 
maybe 
me 
mean 
meanwhile 
merely 
might 
more 
moreover 
most 
mostly 
much 
must 
my 
myself 
name 
namely 
nd 
near 
nearly 
necessary 
need 
needs 
neither 
never 
nevertheless 
new 
next 
nine 
no 
nobody 
non 
none 
noone 
nor 
normally 
not 
nothing 
novel 
now 
nowhere 
obviously 
of 
off 
often 
oh 
ok 
okay 
old 
on 
once 
one 
ones 
only 
onto 
or 
other 
others 
otherwise 
ought 
our 
ours 
ourselves 
out 
outside 
over 
overall 
own 
particular 
particularly 
per 
perhaps 
placed 
please 
plus 
possible 
presumably 
probably 
provides 
que 
quite 
qv 
rather 
rd 
re 
really 
reasonably 
regarding 
regardless 
regards 
relatively 
respectively 
right 
said 
same 
saw 
say 
saying 
says 
second 
secondly 
see 
seeing 
seem 
seemed 
seeming 
seems 
seen 
self 
selves 
sensible 
sent 
serious 
seriously 
seven 
several 
shall 
she 
should 
shouldn’t 
since 
six 
so 
some 
somebody 
somehow 
someone 
something 
sometime 
sometimes 
somewhat 
somewhere 
soon 
sorry 
specified 
specify 
specifying 
still 
sub 
such 
sup 
sure 
take 
taken 
tell 
tends 
th 
than 
thank 
thanks 
thanx 
that 
thats 
that’s 
the 
their 
theirs 
them 
themselves 
then 
thence 
there 
thereafter 
thereby 
therefore 
therein 
theres 
there’s 
thereupon 
these 
they 
they’d 
they’ll 
they’re 
they’ve 
think 
third 
this 
thorough 
thoroughly 
those 
though 
three 
through 
throughout 
thru 
thus 
to 
together 
too 
took 
toward 
towards 
tried 
tries 
truly 
try 
trying 
t’s 
twice 
two 
un 
under 
unfortunately 
unless 
unlikely 
until 
unto 
up 
upon 
us 
use 
used 
useful 
uses 
using 
usually 
value 
various 
very 
via 
viz 
vs 
want 
wants 
was 
wasn’t 
way 
we 
we’d 
welcome 
well 
we’ll 
went 
were 
we’re 
weren’t 
we’ve 
what 
whatever 
what’s 
when 
whence 
whenever 
where 
whereafter 
whereas 
whereby 
wherein 
where’s 
whereupon 
wherever 
whether 
which 
while 
whither 
who 
whoever 
whole 
whom 
who’s 
whose 
why 
will 
willing 
wish 
with 
within 
without 
wonder 
won’t 
would 
wouldn’t 
yes 
yet 
you 
you’d 
you’ll 
your 
you’re 
yours 
yourself 
yourselves 
you’ve 
zero 
zt 
ZT 
zz 
ZZ

Reposted from blog.csdn.net/muyao987/article/details/81112875