Text Processing for NLP

I. Reading Documents

#------------------- Reading an NLTK corpus --------------------
from nltk.corpus import CategorizedPlaintextCorpusReader
data = CategorizedPlaintextCorpusReader(root='./666', fileids=r'.*\.txt',
                                        cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
print(data.words())
print(data.sents(fileids=['neg/6.txt']))

#------------------- Reading a local file --------------------
import re

with open('./data/', 'r', encoding='utf-8') as f:
    text = f.read()
# or
text = open('./data/', 'r', encoding='utf-8').read()
# strip \t and \n (and the 'ham' label used in this dataset)
text = text.replace('\n', ' ').replace('\t', ' ').replace('ham', ' ')
# or: replace every non-word character with a space
text = re.sub(r"\W", ' ', text)
# split into a list of words
words = text.split()

#------------------ Reading files with pandas ----------
import pandas as pd
#------ read xlsx ---
df = pd.read_excel("./date/data_train.xlsx")
print(df)
print(df[0:2])
print(df['sentence'][0:3])
for row in df['sentence']:
    print(row)
#------- read csv ---
dfs = pd.read_csv("./data/yelp.csv")
print(dfs)
for row in dfs['text']:
    print(row)

II. Text Processing

# remove stop words
content = open('./stopwords.bat', encoding='utf-8').read()
stopword = content.split('\n')
words = [word for word in words if word not in stopword]
print(words)
# jieba word segmentation
import jieba
words = jieba.lcut(sent, cut_all=False)   # sent: the sentence to segment
# part-of-speech tagging
import jieba.posseg as psg
text = ""
print(psg.lcut(text))

III. Chinese Word Segmentation

1. What is word segmentation

Word segmentation is the process of splitting a sentence into word-level units according to its context.

2. Why segmentation is needed

English words are naturally separated by spaces, so tokenization is trivial. Chinese text has no such delimiters, and words are the basic units for understanding a sentence. Segmentation is therefore the foundation of Chinese NLP: if it is done poorly, any further analysis becomes difficult. Even though many modern NLP models (such as BERT) operate at the character level, Chinese word segmentation remains an important and fundamental piece of work.

3. Segmentation tools

There are already many open-source Chinese segmentation tools, such as jieba, HanLP and pkuseg. They handle most Chinese text well, so in most cases there is no need to reinvent the wheel. For specialized vertical domains, adding domain terms through a custom dictionary can improve their accuracy, as sketched below.
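
For example, a minimal sketch of registering a domain term with jieba (the term and the dictionary file name are only illustrative):

import jieba
jieba.add_word('自然语言处理')              # register a single domain term at runtime
# jieba.load_userdict('./userdict.txt')    # or load a whole custom dictionary, one word per line
print(jieba.lcut('自然语言处理是人工智能的重要方向'))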

4. Maximum matching segmentation

Dictionary-based segmentation is relatively simple. Depending on where matching starts, it comes in three flavors:

Forward Maximum Matching (FMM)
Backward Maximum Matching (BMM)
Bidirectional Maximum Matching

4.1 Forward Maximum Matching (FMM)

Idea: as the name suggests, FMM takes words from left to right. Starting at the current position it takes a candidate whose length equals that of the longest dictionary word, then repeatedly drops one character from the right until the candidate is found in the dictionary or only a single character remains.

Key points: prepare the dictionary in advance; use the length of the longest dictionary word as the window size; match from left to right.

Python implementation

# Forward maximum matching
class FMM:
    def __init__(self):
        self.word_dict = {'我是', '我', '是', '某某', '大学', '的', '大学生', '生活', '生', '学生'}
        self.size = max(len(w) for w in self.word_dict)   # length of the longest dictionary word

    def cut(self, text):
        word_split = []
        start_index = 0
        while start_index < len(text):
            matched = False
            for end_index in range(self.size, 0, -1):
                word = text[start_index:start_index + end_index]
                if word in self.word_dict:
                    word_split.append(word)
                    start_index += end_index
                    matched = True
                    break
            if not matched:                      # unknown character: output it as a single word
                word_split.append(text[start_index])
                start_index += 1
        return word_split

fmm = FMM()
words1 = fmm.cut("我是某某大学的大学生")
print(words1)   # ['我是', '某某', '大学', '的', '大学生']

4.2 Backward Maximum Matching (BMM)

Idea: as the name suggests, BMM takes words from right to left. Starting at the current position it takes a candidate whose length equals that of the longest dictionary word, then repeatedly drops one character from the left until the candidate is found in the dictionary or only a single character remains.

Key points: prepare the dictionary in advance; use the length of the longest dictionary word as the window size; match from right to left.

Python implementation

# Backward maximum matching
class BMM:
    def __init__(self):
        self.word_dict = {'我是', '我', '是', '某某', '大学', '的', '大学生', '生活', '生', '学生'}
        self.size = max(len(w) for w in self.word_dict)   # length of the longest dictionary word

    def cut(self, text):
        word_split = []
        end_pos = len(text)
        while end_pos > 0:
            matched = False
            for length in range(self.size, 0, -1):
                word = text[max(0, end_pos - length):end_pos]
                if word in self.word_dict:
                    word_split.append(word)
                    end_pos -= len(word)
                    matched = True
                    break
            if not matched:                      # unknown character: output it as a single word
                word_split.append(text[end_pos - 1])
                end_pos -= 1
        word_split.reverse()
        return word_split

bmm = BMM()
words2 = bmm.cut("我是某某大学的大学生")
print(words2)   # ['我是', '某某', '大学', '的', '大学生']

4.3 Bidirectional Maximum Matching

Bidirectional maximum matching runs both FMM and BMM and then, following the principle that coarse-grained words are better while out-of-vocabulary words and single-character words should be as few as possible, outputs one of the two segmentations.

Selection criteria:

  • First compare the number of words in the two results; fewer words is better.
  • If the word counts are equal, compare the number of single-character words; fewer is better.
# continues from the code above

# Bidirectional maximum matching
l1 = len(words1)
l2 = len(words2)
if l1 < l2:                 # fewer words wins
    print(words1)
elif l1 > l2:
    print(words2)
else:
    # same word count: prefer the result with fewer single-character words
    single1 = sum(1 for w in words1 if len(w) == 1)
    single2 = sum(1 for w in words2 if len(w) == 1)
    print(words2 if single2 < single1 else words1)

5. The Viterbi algorithm

Each candidate word in the dictionary below carries a weight that plays the role of a negative log probability (smaller means more likely). The sentence is turned into a directed acyclic graph whose edges are candidate words, and dynamic programming finds the minimum-cost path through it, i.e. the most likely segmentation.

# Viterbi word segmentation
# plist maps candidate words to weights that act as negative log probabilities (smaller = more likely)
plist = {'经常': 2.3, '经': 3, '有': 2.3, '有意见': 2.3, '意见': 1.6, '分歧': 1.6,
         '见': 3, '意': 3, '见分歧': 3, '分': 2.3}

def dag(sentence):
    # outlinks[i] = end positions j such that sentence[i:j] is a word in plist
    outlinks = {}
    for i in range(len(sentence)):
        outlinks[i] = []
        for j in range(1, 4):                       # the longest word in plist has 3 characters
            if i + j <= len(sentence) and sentence[i:i + j] in plist:
                outlinks[i].append(i + j)
    # inlinks[j] = start positions i that have an edge i -> j
    inlinks = {}
    for key, values in outlinks.items():
        for value in values:
            inlinks.setdefault(value, []).append(key)
    return outlinks, inlinks

def viterbi(sentence, inlinks):
    n = len(sentence)
    p = [0] * (n + 1)   # p[end]: minimum cost of segmenting sentence[:end]
    f = [0] * (n + 1)   # f[end]: best predecessor position (back-pointer)
    for end in range(1, n + 1):
        # best path ending at `end`: p[start] + plist[sentence[start:end]]
        p[end], f[end] = min((p[start] + plist.get(sentence[start:end], 0), start)
                             for start in inlinks[end])
    return f, p

sentence = '经常有意见分歧'
outlinks, inlinks = dag(sentence)
print(outlinks)
print(inlinks)
f, p = viterbi(sentence, inlinks)
print(f)
print(p)
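
A minimal sketch of recovering the actual word sequence from the back-pointers returned above (walking f from the end of the sentence back to the start):

# backtrack the segmentation from the back-pointer array f
def backtrack(sentence, f):
    words = []
    end = len(sentence)
    while end > 0:
        start = f[end]
        words.append(sentence[start:end])
        end = start
    return list(reversed(words))

print(backtrack(sentence, f))   # expected: ['经常', '有意见', '分歧']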

IV. English Tokenization

#------------------- English tokenization --------------------
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

sentence = "Hello world. Welcome to China, we are Chinese"
words = word_tokenize(sentence)
# remove stop words and punctuation
words = [word for word in words if word not in stopwords.words('english') and word not in string.punctuation]
print(words)
# hand-rolled tokenization
text = open("./data/text.txt", 'r', encoding='utf-8').read()
text = re.sub(r"\W", ' ', text)   # replace non-word characters with spaces
words = text.split()
print(words)
# part-of-speech tagging
taglist = nltk.pos_tag(words)
print(taglist)
sentences = [words, words]
print(nltk.pos_tag_sents(sentences=sentences))

#------------- Stemming --------------------------
from nltk import PorterStemmer, LancasterStemmer
lancaster = LancasterStemmer()
porter = PorterStemmer()
print(lancaster.stem('building'))
print(lancaster.stem('built'))
print(lancaster.stem('have'))
print(porter.stem('building'))
print(porter.stem('built'))
print(porter.stem('have'))

#------------- Lemmatization ------------------
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('wolves'))
print([lemmatizer.lemmatize(word) for word in words])
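
WordNetLemmatizer treats words as nouns unless told otherwise; passing a part-of-speech hint usually gives better results for verbs. A small illustrative sketch:

print(lemmatizer.lemmatize('built'))            # 'built' (treated as a noun)
print(lemmatizer.lemmatize('built', pos='v'))   # 'build' (treated as a verb)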

V. Bag-of-Words Model

import re

text='''PM denies knowledge of AWB kickbacks
The Prime Minister has denied he knew AWB was paying kickbacks to Iraq despite writing to the wheat exporter asking to be kept fully informed on Iraq wheat sales.
Letters from John Howard and Deputy Prime Minister Mark Vaile to AWB have been released by the Cole inquiry into the oil for food program.
In one of the letters Mr Howard asks AWB managing director Andrew Lindberg to remain in close contact with the Government on Iraq wheat sales.
'''
text = re.sub(r"\W", ' ', text)   # replace non-word characters with spaces
words=text.split()

word2id = {}
id2word = {}
wordbag = {}
result_count = {}
text_count = {}

for i in words:
    if i == '':                       # skip empty strings
        continue
    if i not in text_count.keys():    # first time we see this word
        text_count[i] = 1
    else:
        text_count[i] += 1            # otherwise increment its count

# sort by frequency, descending
result = sorted(text_count.items(), key=lambda x: x[1], reverse=True)
for n, s in result:
    result_count[n] = s

i = 0
for key, value in result_count.items():
    wordbag[key] = [i, value]         # word -> [id, corpus frequency]
    word2id[key] = i
    id2word[i] = key
    i += 1

print(wordbag)

raw_documents=['The Prime Minister has denied he knew AWB was paying kickbacks to',
               'Letters from John Howard and Deputy Prime Minister Mark Vaile'             ]

vectors = []
for sen in raw_documents:
    words = sen.split()
    vector = [0] * len(wordbag)
    for w in words:
        if w in word2id:              # skip words that are not in the vocabulary
            vector[word2id[w]] += 1   # count occurrences of the word in this document
    print(vector)

    vectors.append(vector)
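
A small usage sketch of the word2id / id2word mappings built above: encode a short sentence as ids and decode it back (words outside the vocabulary are skipped; the sentence is just an example).

sent = 'Prime Minister Mark Vaile'
ids = [word2id[w] for w in sent.split() if w in word2id]
print(ids)
print([id2word[i] for i in ids])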

VI. Markov (HMM part-of-speech tagging)

lines = open('./data/postraindata.txt', encoding='utf8').read()
lines = lines.split('\n')
lines = [line.split('/') for line in lines if line.strip()]   # list of [word, tag] pairs
words = sorted(set([line[0] for line in lines]))   # vocabulary, sorted
tags = sorted(set([line[1] for line in lines]))    # tag set, sorted
word2id = {}
id2word = {}
tag2id = {}
id2tag = {}
for index, word in enumerate(words):
    word2id[word] = index
    id2word[index] = word

for index, tag in enumerate(tags):
    tag2id[tag] = index
    id2tag[index] = tag

print(word2id)
print(id2word)

import numpy as np
M = len(words)   # number of words
N = len(tags)    # number of tags
B = np.zeros((N, M))   # emission counts: tag -> word
A = np.zeros((N, N))   # transition counts: previous tag -> current tag
pi = np.zeros(N)       # initial-tag counts
pre_tag = ""           # tag of the previous word (empty at the start of a sentence)
#----------- estimate pi, A and B -------------
for line in lines:
    tag, word = tag2id[line[1]], word2id[line[0]]
    B[tag][word] += 1
    if pre_tag == "":   # pre_tag == "" means we are at the start of a sentence
        # count how often each tag starts a sentence
        pi[tag] += 1    # pi only depends on the first tag of a sentence
    else:
        # count transitions from the previous tag to the current tag
        A[tag2id[pre_tag]][tag] += 1
    if line[1] == ".":  # a period ends the sentence, so the next word starts a new one
        pre_tag = ""
    else:
        pre_tag = line[1]
# normalize the counts into probabilities (guard against all-zero rows)
pi = pi / sum(pi)
for i in range(N):
    if A[i].sum() > 0:
        A[i] = A[i] / A[i].sum()
    if B[i].sum() > 0:
        B[i] = B[i] / B[i].sum()
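
A quick sanity check on the estimated parameters (a sketch; it assumes the word 'the' occurs in the training file): pi should sum to 1, and a column of B lets us look up the tag with the highest emission probability for a known word.

print(pi.sum())
print(id2tag[int(np.argmax(B[:, word2id['the']]))])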

def log(v):
    if v==0:
        return np.log(0.0000001)
    else:
        return np.log(v)


def tag_pos(text, A, B, pi):
    words = text.split()
    T = len(words)
    dp = np.zeros((N, T))                  # dp[row][col]: best log-probability ending in tag `row` at position `col`
    pre = np.zeros((N, T), dtype=int)      # back-pointer to the best previous tag

    for row in range(N):                   # row indexes tags, e.g. 0:nn, 1:nns, 2:dt, ...
        # initialisation: log(pi) + log(B)
        dp[row][0] = log(pi[row]) + log(B[row][word2id[words[0]]])
    for col in range(1, T):
        for row in range(N):
            # HMM recursion: dp + log(A) + log(B)
            dp[row][col], pre[row][col] = max(
                (dp[i][col - 1] + log(A[i][row]) + log(B[row][word2id[words[col]]]), i)
                for i in range(N))

    # backtrack from the best final tag to recover the tag sequence
    best = int(np.argmax(dp[:, T - 1]))
    best_tags = [best]
    for col in range(T - 1, 0, -1):
        best = int(pre[best][col])
        best_tags.append(best)
    best_tags.reverse()
    print([id2tag[t] for t in best_tags])


text = "Alan Spoon , recently named Newsweek president , said Newsweek 's ad rates would increase 5 % in January ."
tag_pos(text, A, B, pi)

VII. Vectorization

# Text vectorization with TF-IDF encoding
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
raw_documents=['Sajak, 76, has presided over the game show, which features contestants guessing letters to try to fill out words and phrases to win money and prizes, since 1981. He took over duties from Chuck Woolery, who was the show’s first host when it debuted in 1975',
               'Along with Vanna White, who joined the show in 1982, Sajak has been a television mainstay. The show soon shifted to a syndication and aired in the evening in many markets, becoming one of the most successful game shows in history. Sajak will continue to serve as a consultant on the show for three years after his retirement as host',
               'In recent years, some of Sajak’s banter and chiding of contestants have become fodder for social media. That prompted Sajak to remark in his retirement post about doing another season: “(If nothing else, it’ll keep the clickbait sites busy',
               'As the host of Wheel of Fortune, Pat has entertained millions of viewers across America for 40 amazing years. We are incredibly grateful and proud to have had Pat as our host for all these years and we look forward to celebrating his outstanding career throughout the upcoming season, said Suzanne Prete, executive vice president of game shows for Sony Pictures Television',
               'Those scenes that, for almost a half-century, seemed impossible, then more recently started feeling inevitable, finally turned into reality Monday night',
               'The Nuggets outlasted the Miami Heat 94-89 in an ugly, frantic Game 5 that did nothing to derail Nikola Jokic, who bailed out his teammates with 28 points and 16 rebounds on a night when nothing else seemed to work']
vector=vectorizer.fit_transform(raw_documents)
print(vector)
print(vector.todense())
print(vectorizer.vocabulary_)

# ----------- One-hot (binary) encoding for text vectorization ------------------------
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)
document_vec = cv.fit_transform(raw_documents)
# inspect the vocabulary and the corresponding vectors
print(cv.get_feature_names_out())
print(document_vec.toarray())
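
Once fitted, the same vectorizer can encode unseen text against the learned vocabulary (a usage sketch; the sentence is made up, and words outside the vocabulary are simply ignored):

new_vec = cv.transform(['the game show host said nothing'])
print(new_vec.toarray())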



from gensim.models import Word2Vec
from nltk import word_tokenize

sentence = raw_documents   # reuse the documents defined above

# tokenize each document into a list of lowercase words (Word2Vec expects an iterable of token lists)
sentences = [word_tokenize(sen.lower()) for sen in sentence]
model=Word2Vec(sentences=sentences,min_count=1,window=2,vector_size=20,sg=1)
print(model.wv['host'])
print(model.wv['scenes'])
# compute the similarity between two words
print(model.wv.similarity('host','scenes'))
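
The trained model can also return the nearest neighbours of a word in the embedding space (a usage sketch; with a toy corpus this small the neighbours are not very meaningful):

print(model.wv.most_similar('host', topn=3))
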
from gensim.models import TfidfModel,LsiModel,LdaModel
from nltk import word_tokenize
from gensim.corpora import Dictionary

text="Like many monuments, the one of Robert E Lee in Richmond required great civic fundraising efforts and creativity over many years toward its final unveiling on May 29th, 1890.  It was inspired by a lithograph of Bavarian artist Adalbert Volck and cast by French sculptor Antonin Mercié.  The newspapers of the time say that 10,000 people helped pull the four wagons carrying pieces of the 12-ton bronze statue of Robert E.Lee mounted on his horse."

# split the text into sentences, lowercase and tokenize each one
from nltk import sent_tokenize
document = [sent.lower() for sent in sent_tokenize(text)]
documents = [word_tokenize(doc) for doc in document]

dic = Dictionary(documents)
copara = [dic.doc2bow(doc) for doc in documents]  # doc2bow(doc) turns each tokenized sentence into a (token_id, count) vector
# copara is an iterable of bag-of-words vectors
tfmodel = TfidfModel(copara, id2word=dic)
lsimodel=LsiModel(tfmodel[copara],num_topics=3,id2word=dic)
print('-'*50)
print(lsimodel.projection.u)
print(lsimodel.projection.s)
print(list(lsimodel[tfmodel[copara]]))
print(lsimodel.print_topic(0))


ldamodel=LdaModel(tfmodel[copara],num_topics=3,id2word=dic,passes=3)

print(ldamodel.print_topic(0))
print(ldamodel.print_topic(1))
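
A usage sketch of applying the trained LDA model to a new document (the sentence is made up; words the dictionary does not contain are dropped by doc2bow):

new_doc = word_tokenize('the statue of robert e lee was unveiled in richmond'.lower())
print(ldamodel[tfmodel[dic.doc2bow(new_doc)]])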
