NLP: Neural Network Language Model (NNLM), Word Vector Generation, Word Embeddings, a Python Implementation

The theory mainly comes from the paper A Neural Probabilistic Language Model (Bengio et al.), which is easy to find online.

This blog post gives a fairly good introduction to the theory side: link

Below is its algorithm-steps part (the original post shows these as screenshots).

Algorithm steps

Forward computation
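
Since the screenshots do not reproduce here, here is a sketch of the forward pass, reconstructed to match the code below (Bengio et al.'s model without the optional direct input-to-output connections; $x$ is the concatenation of the $n$ context word vectors, $H$, $d$ the hidden layer, $U$, $b$ the output layer):

$$o = Hx + d, \qquad a = \tanh(o), \qquad y = Ua + b, \qquad p_j = \frac{e^{y_j}}{\sum_{k=1}^{V} e^{y_k}}, \qquad L = \log p_{w_t}$$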

Backward update
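
And the corresponding stochastic gradient-ascent updates on $L$ with learning rate $\theta$, again reconstructed to match the code rather than copied from the screenshots:

$$\frac{\partial L}{\partial y_j} = \mathbf{1}[j = w_t] - p_j, \qquad \frac{\partial L}{\partial a} = U^{\top}\frac{\partial L}{\partial y}, \qquad \frac{\partial L}{\partial o} = (1 - a \odot a)\odot\frac{\partial L}{\partial a}, \qquad \frac{\partial L}{\partial x} = H^{\top}\frac{\partial L}{\partial o}$$

$$b \leftarrow b + \theta\frac{\partial L}{\partial y}, \qquad U \leftarrow U + \theta\frac{\partial L}{\partial y}\,a^{\top}, \qquad d \leftarrow d + \theta\frac{\partial L}{\partial o}, \qquad H \leftarrow H + \theta\frac{\partial L}{\partial o}\,x^{\top}, \qquad C(w_i) \leftarrow C(w_i) + \theta\Big(\frac{\partial L}{\partial x}\Big)_i$$

where $(\partial L / \partial x)_i$ is the $m$-dimensional slice of the input gradient belonging to context word $w_i$.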

My own implementation:

import glob
import random
import math
import pickle
import numpy as np


# Activation function: element-wise tanh over a vector
def tanh(o):
    return [math.tanh(i) for i in o]
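# (np.tanh(o) is a vectorized equivalent, if you prefer to stay in numpy.)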


def get_stopword_list(path):
    """
    Load the stop-word list.
    """
    with open(path, 'r', encoding='utf8') as f:
        return [sw.replace('\n', '') for sw in f]


def data_pre(path):
    """
    Load one file, tokenize it with jieba, and drop stop words.
    """
    import jieba
    content = []
    sw_list = get_stopword_list('./data/stop_words.utf8')  # load once, not per line
    with open(path, 'r', encoding='gbk', errors='ignore') as f:
        for l in f:
            l = l.strip()
            if len(l) == 0:
                continue
            l = [x for x in jieba.cut(l) if x not in sw_list]
            content.append(l)
    return content
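
# Hypothetical quick check (the file name below is made up):
# sentences = data_pre('./data/news/0001.txt')
# print(sentences[0])  # one sentence as a list of tokens, stop words removed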


# Randomly initialize the word vectors and assign word ids
def creat_wv(wd, m):
    wvd = {w: [random.random() for x in range(m)] for w in wd}
    wid = {w: i for i, w in enumerate(wvd)}
    return wvd, wid


f = glob.glob(r'./data/news/*.txt')
data = []
wd = {}
c = 0
sf = len(f)
for text in f:
    c += 1
    temp = data_pre(text)
    data.extend(temp)
    for t in temp:
        for w in t:
            wd[w] = wd.get(w, 0)+1
    print(text+' complete ', end='')
    print(c/sf)
# print(data)
savedata = np.array(data, dtype=object)  # ragged list of sentences -> object array
swd = np.array(wd)
np.save('./data/sogo_news.npy', savedata)
np.save('./data/myw2vwd.npy', swd)
# data = np.load('./data/sogo_news.npy', allow_pickle=True).tolist()
# Initialize the network
h = 100  # number of hidden units
v = len(wd)  # vocabulary size V
m = 100  # word-vector dimension
n = 4  # number of context words per sample (2*win)
win = 2  # window size on each side of the target word
theta = 0.1  # learning rate
# Input-to-hidden weights, shape = h x (n*m)
H = np.array([[random.random() for j in range(n*m)] for i in range(h)])
d = np.array([random.random() for j in range(h)])  # hidden bias, shape = (h,)
U = np.array([[random.random() for j in range(h)]
              for i in range(v)])  # hidden-to-output weights, shape = (V, h)
b = np.array([random.random() for j in range(v)])  # output bias, shape = (V,)
maxtime = 5  # number of training epochs
space = [0 for i in range(m)]  # zero vector for padding
wvd, wid = creat_wv(wd, m)  # random word vectors and word ids
sums = len(data)
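# Training loop: for each word of each sentence, take win words on either
# side as context (n = 2*win input slots, padded with the zero vector at
# sentence boundaries), run the forward pass, and apply one stochastic
# gradient-ascent step on the log-likelihood.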
while maxtime > 0:
    maxtime -= 1
    # Train the network
    sm = 0
    for s in data:  # s is one sentence
        aa = (sm+0.0)/sums  # progress through this epoch
        sm += 1
        print('epochs left ', end='')
        print(maxtime, end='------------')
        print(aa)
        for w in range(len(s)):  # w is the index of the target word
            # Build the input vector x from the context window
            x = []
            inputword = []  # (slot, word) pairs for the real context words
            w_id = wid[s[w]]  # id of the target word
            slot = 0
            for i in range(w-win, w+win+1):
                if i == w:
                    continue
                if i < 0 or i >= len(s):
                    x.extend(space)  # pad positions outside the sentence
                else:
                    x.extend(wvd[s[i]])
                    inputword.append((slot, s[i]))
                slot += 1

            #--- Forward pass ------------------------
            # Hidden-layer input
            o = np.dot(x, H.T) + d
            # Hidden-layer output
            a = np.array(tanh(o))
            # Output-layer input
            y = (np.dot(a, U.T) + b).tolist()
            # Softmax output (shifted by max(y) for numerical stability)
            my = max(y)
            p = [math.exp(i - my) for i in y]
            S = sum(p)
            p = [i/S for i in p]
            #--- End of forward pass ------------------------

            # Objective L: log-likelihood of the target word (kept for monitoring)
            if p[w_id] > 0:
                L = math.log(p[w_id])
            else:
                L = math.log(2.2250738585072014e-200)  # floor to avoid log(0)

            #--- Backpropagation ------------------------
            # dL/dy: one-hot target minus the softmax output
            ly = np.array([-i for i in p])
            ly[w_id] += 1
            # dL/da must be computed with the old U, before U is updated
            la = np.dot(ly, U)
            # Update output bias and hidden-to-output weights
            b = b + theta*ly
            U = U + theta*np.outer(ly, a)
            # Back through tanh: dL/do = (1 - a^2) * dL/da
            lo = (1 - a*a)*la
            # dL/dx, using the old H
            lx = np.dot(H.T, lo)
            # Update hidden bias and input-to-hidden weights
            d = d + theta*lo
            H = H + theta*np.outer(lo, x)
            # Move the input vector along its gradient, then write each
            # updated slice back into its context word's vector
            x = (np.array(x) + theta*lx).tolist()
            for q, word in inputword:
                wvd[word] = x[q*m:(q+1)*m]
            #--- End of backward update
# Save the trained word vectors
with open('./data/myw2v.pkl', 'wb') as output:
    pickle.dump(wvd, output)


Test code

import math


# Euclidean distance between two vectors
def dis(a, b):
    s = 0
    for i in range(len(a)):
        t = a[i] - b[i]
        s += t*t
    return math.sqrt(s)

import pickle

with open('./data/myw2v.pkl', 'rb') as inputt:
    wd = pickle.load(inputt)
a = wd['记者']  # 'reporter'
b = wd['公司']  # 'company'
c = wd['企业']  # 'enterprise'
d = wd['交易']  # 'transaction'
e = wd['支付']  # 'payment'
print(dis(a, b))
print(dis(b, c))
print(dis(e, d))
print(dis(a, e))
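
Euclidean distance is sensitive to vector magnitude, which nothing in this model normalizes; cosine similarity is the more common way to compare embeddings. A minimal sketch (the cos_sim helper is my addition, not part of the original code):

import math
import pickle


# Cosine similarity: 1.0 means the two vectors point in the same direction
def cos_sim(a, b):
    dot = sum(x*y for x, y in zip(a, b))
    na = math.sqrt(sum(x*x for x in a))
    nb = math.sqrt(sum(x*x for x in b))
    return dot/(na*nb)


with open('./data/myw2v.pkl', 'rb') as f:
    wd = pickle.load(f)
print(cos_sim(wd['公司'], wd['企业']))  # company vs. enterprise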

Reposted from blog.csdn.net/aaalswaaa1/article/details/84778725