文章目录
- CRF 是无向图模型
- 它是一个判别式模型
- 建模了每个状态和整个观测序列的依赖
https://www.unclewang.info/learn/machine-learning/756/
https://blog.csdn.net/u012421852/article/details/80287567
下面用矩阵形式计算规范化因子 Z:
import numpy as np
# Labels of the artificial start and stop states in the worked example.
# NOTE(review): neither name is read by any code visible in this section.
y0=1  # start
y4=1  # stop
从start到stop的所有路径的规范化因子Z,其实就是上面所有路径的非规范化概率之和
class CCRF(object):
    """Matrix-form linear-chain CRF that computes the normalization factor Z.

    ``M`` is a stack of square per-position matrices M_1 .. M_{n+1}. Z is
    the entry of the product M_1 @ M_2 @ ... @ M_{n+1} in the first row and
    first column, i.e. start state and stop state both have index 0.
    """

    def __init__(self, M):
        self.M = M  # per-position matrices of the CRF
        self.Z = None  # normalization factor, filled in by work()
        self.MP = []  # running matrix product
        self.work()

    def work(self):
        """Multiply the matrices left to right, printing every step."""
        print('work......')
        # Seed with the identity so the result is exactly M[0] @ M[1] @ ...
        # An all-ones seed would add up every row of M[0], which only gives
        # the right Z when the rows of non-start states happen to be zero.
        self.MP = np.eye(np.shape(self.M[0])[0])
        for i in range(np.shape(self.M)[0]):  # one product per matrix
            print('\nML=\n', self.MP)
            print('M%d=\n' % i, self.M[i])
            self.MP = np.dot(self.MP, self.M[i])  # matrix multiplication
            print('dot=\n', self.MP)
        self.Z = self.MP[0, 0]

    def ZValue(self):
        """Return Z, the (start, stop) = (0, 0) entry of the full product."""
        return self.MP[0, 0]
def CCRF_manual():
    """Run the hand-written four-matrix example and print its Z."""
    start_to_y1 = [[0.5, 0.5], [0, 0]]  # a01, a02
    y1_to_y2 = [[0.3, 0.7], [0.7, 0.3]]  # b11, b12, b21, b22
    y2_to_y3 = [[0.5, 0.5], [0.6, 0.4]]
    y3_to_stop = [[1, 0], [1, 0]]
    # Stack the four 2x2 matrices into one (4, 2, 2) array.
    M = np.array([start_to_y1, y1_to_y2, y2_to_y3, y3_to_stop])
    print('CRF 矩阵:\n', M)
    ret = CCRF(M).ZValue()
    print('从start到stop的规范因子Z:', ret)
# Run the hand-written matrix example only when executed as a script.
if __name__=='__main__':
    CCRF_manual()
CRF 矩阵:
[[[0.5 0.5]
[0. 0. ]]
[[0.3 0.7]
[0.7 0.3]]
[[0.5 0.5]
[0.6 0.4]]
[[1. 0. ]
[1. 0. ]]]
work......
ML=
[[1. 1.]
[1. 1.]]
M0=
[[0.5 0.5]
[0. 0. ]]
dot=
[[0.5 0.5]
[0.5 0.5]]
ML=
[[0.5 0.5]
[0.5 0.5]]
M1=
[[0.3 0.7]
[0.7 0.3]]
dot=
[[0.5 0.5]
[0.5 0.5]]
ML=
[[0.5 0.5]
[0.5 0.5]]
M2=
[[0.5 0.5]
[0.6 0.4]]
dot=
[[0.55 0.45]
[0.55 0.45]]
ML=
[[0.55 0.45]
[0.55 0.45]]
M3=
[[1. 0.]
[1. 0.]]
dot=
[[1. 0.]
[1. 0.]]
从start到stop的规范因子Z: 1.0
1.1 一般参数形式
import torch
import torch.nn as nn
sequence_len = 3;  # number of labelled positions y1..y3
y_size = 2;  # number of values each label can take
k=5  # number of transition features (first axis of t)
l=4  # number of state features (first axis of s)
# Transition features.
# Read as t[feature, position, y_i, y_{i+1}]: k features, two transitions
# (y1->y2, y2->y3) for a length-3 sequence, and a 2x2 table per transition.
t=torch.tensor( [[[[0,1],[0,0]],[[0,1],[0,0]]],
                 [[[1,0],[0,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[1,0]]],
                 [[[0,0],[1,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[0,1]]]],dtype=float);
lamb=torch.tensor([1,0.5,1,1,0.2])  # one weight per transition feature
# State (emission) features.
# Read as s[feature, position, y_i]: l features, 3 positions, 2 label values.
s=torch.tensor( [[[1,0],[0,0],[0,0]],
                 [[0,1],[0,1],[0,0]],
                 [[0,0],[1,0],[1,0]],
                 [[0,0],[0,0],[0,1]]],dtype=float)
mu = torch.tensor([1, 0.5, 0.8, 0.5])  # one weight per state feature
# Original remark: "compared with the above, this adds a start y0 and an end y4".
# NOTE(review): the function that follows takes y without start/stop labels;
# the remark seems to describe the later simplified/matrix forms - confirm.
def P_y_x_condition(y):
    """Score label sequence ``y`` with the parametric form of the CRF.

    Adds up the weighted transition features (``lamb``, ``t``) over adjacent
    label pairs and the weighted state features (``mu``, ``s``) over single
    labels, prints the total and returns its exponential. The result is not
    divided by Z, so it is an unnormalized score.
    """
    trans_score = 0
    for feat in range(k):
        for pos, (prev_tag, next_tag) in enumerate(zip(y[:-1], y[1:])):
            trans_score += lamb[feat] * t[feat, pos, prev_tag, next_tag]
    state_score = 0
    for feat in range(l):
        for pos, tag in enumerate(y):
            state_score += mu[feat] * s[feat, pos, tag]
    total = state_score + trans_score
    print(total)
    return torch.exp(total)
# Label sequence of the worked example, written with 0-based label values.
y=[0,1,1]
# P_y_x_condition returns exp(score) without dividing by Z,
# so the value printed here is an unnormalized score.
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=",P_y_x_condition(y))
tensor(3.2000, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(24.5325, dtype=torch.float64)
1.2 简化形式
- 这里也引入了起点,下面代码中引入了起点和终点(只要起点就行)
# Unified feature tensor, read as f[feature, position, y_{i-1}, y_i]:
# k + l = 9 features, 4 positions (start->y1, y1->y2, y2->y3, y3->stop), 2x2 tables.
# Rows 0-4 hold the transition features of t moved to positions 1 and 2.
# Rows 5-8 hold the state features of s; their tables repeat the same row
# twice, so the value depends only on the second label y_i.
f=torch.tensor([ [[[0,0],[0,0]],[[0,1],[0,0]],[[0,1],[0,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[1,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[1,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[1,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,1]],[[0,0],[0,0]]],
                 [[[1,0],[1,0]],[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
                 [[[0,1],[0,1]],[[0,1],[0,1]],[[0,0],[0,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[1,0],[1,0]],[[1,0],[1,0]],[[0,0],[0,0]]],
                 [[[0,0],[0,0]],[[0,0],[0,0]],[[0,1],[0,1]],[[0,0],[0,0]]]],dtype=float);
# Weights for f: the five lamb values followed by the four mu values.
w=torch.tensor([1,0.5,1,1,0.2,1, 0.5, 0.8, 0.5])
def P_y_x_condition_with_f( y):
    """Score the padded label path ``y`` with the unified features ``f``.

    Accumulates ``w[i] * f[i, j, y[j], y[j+1]]`` over all k + l features and
    all adjacent label pairs, prints the total and returns its exponential
    (an unnormalized score, not divided by Z).
    """
    score = 0
    for feat in range(k + l):
        for pos, (prev_tag, next_tag) in enumerate(zip(y[:-1], y[1:])):
            score += w[feat] * f[feat, pos, prev_tag, next_tag]
    print(score)
    return torch.exp(score)
# The path is y=[0,1,1] with one extra label 0 in front (start) and one at the end (stop).
p_y_x_con=P_y_x_condition_with_f([0,0,1,1,0])
# The function does not divide by Z, so this prints an unnormalized score.
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=",p_y_x_con)
tensor(3.2000, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(24.5325, dtype=torch.float64)
Z
1.3 矩阵形式
a01 表示从 start(y0=1)转移到 y1=1 的非规范化概率,
b21 表示从 y1=2 转移到 y2=1 的非规范化概率
w=torch.tensor([1,0.5,1,1,0.2,1, 0.5, 0.8, 0.5])
# Build the matrix form M[i, y_{i-1}, y_i] = exp(sum_k w_k * f_k[i, y_{i-1}, y_i]).
# Work on a copy of f: a plain `M = f` would alias the feature tensor, so the
# loop below would overwrite f with weighted values and corrupt every later
# use of f (for example the feature expectations).
M = f.clone()
for i in range(k+l):
    M[i] = w[i] * f[i]
print(torch.sum(M, axis=0))
M = torch.exp(torch.sum(M, axis=0))
print("M(i,y_i-1,y_i):\n", M)
# Because y0 = 0 and y_{n+1} = 0, the entries that leave a non-start state at
# position 0 or enter a non-stop state at position 3 may be set to 0:
# M[0,1,0]=M[0,1,1]=M[3,0,1]=M[3,1,1]=0
tensor([[[1.0000, 0.5000],
[1.0000, 0.5000]],
[[1.3000, 1.5000],
[1.8000, 0.5000]],
[[0.8000, 1.5000],
[1.8000, 0.7000]],
[[0.0000, 0.0000],
[0.0000, 0.0000]]], dtype=torch.float64)
M(i,y_i-1,y_i):
tensor([[[2.7183, 1.6487],
[2.7183, 1.6487]],
[[3.6693, 4.4817],
[6.0496, 1.6487]],
[[2.2255, 4.4817],
[6.0496, 2.0138]],
[[1.0000, 1.0000],
[1.0000, 1.0000]]], dtype=torch.float64)
1.3.2 Z
从start到stop对应于y=(1,1,1),y=(1,1,2), …, y=(2,2,2)这8条路径的非规范化概率分别是:
-
a01b11c11,a01b11c12,a01b12c21,a01b12c22
a02b21c11,a02b21c12,a02b22c21,a02b22c22
然后按式11.12求规范化因子,通过计算矩阵乘积M1(x) M2(x) M3(x) M4(x)可知,其第一行第一列的元素为
- a01b11c11 + a01b11c12 + a01b12c21 + a01b12c22 + a02b21c11 + a02b21c12 + a02b22c21 + a02b22c22
恰好等于从start到stop的所有路径的非规范化概率之和,即规范化因子Z(x)。
def Z_M(M):
    """Return the normalization factor Z from the matrix form ``M``.

    Multiplies every matrix in ``M`` from left to right and returns the
    entry in the first row and first column (start and stop state are both
    index 0). The number of matrices is taken from ``M`` itself, so the
    function does not depend on a global sequence length.
    """
    z = M[0]
    for step_matrix in M[1:]:
        z = torch.matmul(z, step_matrix)
    return z[0, 0]
# Normalization factor Z of the example chain, from the matrix product.
print(Z_M(M))
tensor(253.9492, dtype=torch.float64)
def P_y_x_condition_with_M(y):
    """Return p(y|x) for the padded label path ``y`` using the matrix form.

    Multiplies the entries ``M[i, y[i], y[i+1]]`` along the path, prints that
    unnormalized product and returns it divided by ``Z_M(M)``.
    """
    unnormalized = 1
    for pos, (prev_tag, next_tag) in enumerate(zip(y[:-1], y[1:])):
        unnormalized *= M[pos, prev_tag, next_tag]
    print(unnormalized)
    return unnormalized / Z_M(M)
# Same padded path as before (start label, y=[0,1,1], stop label); this time
# the result is divided by Z, so it is a proper conditional probability.
p_y_x_con=P_y_x_condition_with_M([0,0,1,1,0])
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=",p_y_x_con)
tensor(24.5325, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(0.0966, dtype=torch.float64)
2.维特比算法
# Log of the matrix form; Viterbi_M adds these values as scores.
print(torch.log(M))
tensor([[[1.0000, 0.5000],
[1.0000, 0.5000]],
[[1.3000, 1.5000],
[1.8000, 0.5000]],
[[0.8000, 1.5000],
[1.8000, 0.7000]],
[[0.0000, 0.0000],
[0.0000, 0.0000]]], dtype=torch.float64)
def Viterbi_M():
    """Decode the highest-scoring label path with the Viterbi recursion.

    Works on the log of the global matrix form ``M``. ``delta[i, y]`` is the
    best score of any partial path that ends in label ``y`` at position
    ``i``; the arg-max predecessors are stored and followed backwards from
    the best final label. Prints the score matrix of every step and the
    final ``delta`` table.

    Returns:
        An int tensor of length ``sequence_len`` holding the best labels.
    """
    logM = torch.log(M)
    # Sized from the globals instead of a hard-coded (3, 2). delta keeps the
    # default float dtype, so values taken from logM are cast on assignment.
    delta = torch.zeros(sequence_len, y_size)
    delta[0] = logM[0, 0]  # scores of leaving the start state (label 0)
    indices = []
    for i in range(1, sequence_len):
        # scores[prev, cur] = best score ending in prev + log M_i[prev, cur]
        scores = delta[i-1].reshape(y_size, 1) + logM[i]
        print(scores)
        delta[i], indice = torch.max(scores, axis=0)
        indices.append(indice)
    print(delta)
    path = torch.zeros(sequence_len, dtype=torch.int)
    path[sequence_len-1] = torch.argmax(delta[sequence_len-1])
    # indices[i] holds, for each label at position i+1, its best predecessor.
    for i in range(sequence_len-2, -1, -1):
        path[i] = indices[i][path[i+1]]
    return path
# Notebook cell: the returned path is shown as the cell output.
Viterbi_M()
tensor([[2.3000, 2.5000],
[2.3000, 1.0000]], dtype=torch.float64)
tensor([[3.1000, 3.8000],
[4.3000, 3.2000]], dtype=torch.float64)
tensor([[1.0000, 0.5000],
[2.3000, 2.5000],
[4.3000, 3.8000]])
tensor([0, 1, 0], dtype=torch.int32)
3.前向算法
- 一般
- 矩阵形式
# Start and stop labels are fixed to 0: zero the entries that leave label 1
# at position 0 and the entries that enter label 1 at position 3.
M[0,1,0]=M[0,1,1]=M[3,0,1]=M[3,1,1]=0
def alpha():
    """Compute the forward vectors of the chain in matrix form.

    Row 0 puts all mass on label 0 (the start state); every following row is
    the previous row times the matching matrix of the global ``M``. Prints
    the finished table and returns it; it has ``sequence_len + 2`` rows.
    """
    forward = torch.zeros(sequence_len + 2, y_size, dtype=float)
    forward[0, 0] = 1
    for step in range(sequence_len + 1):
        row_vector = forward[step].reshape(1, y_size)
        forward[step + 1] = torch.matmul(row_vector, M[step])
    print(forward)
    return forward
# NOTE: this rebinds the name `alpha` from the function to its result, so the
# function itself cannot be called again after this line.
alpha=alpha()
tensor([[ 1.0000, 0.0000],
[ 2.7183, 1.6487],
[ 19.9484, 14.9008],
[134.5403, 119.4088],
[253.9492, 0.0000]], dtype=torch.float64)
4.后向算法
def beta():
    """Compute the backward vectors of the chain in matrix form.

    The last row puts all mass on label 0 (the stop state); every earlier
    row is the matching matrix of the global ``M`` times the row after it.
    Prints the finished table and returns it; it has ``sequence_len + 2``
    rows.
    """
    backward = torch.zeros(sequence_len + 2, y_size, dtype=float)
    backward[sequence_len + 1, 0] = 1
    for step in reversed(range(sequence_len + 1)):
        column_vector = backward[step + 1].reshape(y_size, 1)
        backward[step] = torch.matmul(M[step], column_vector).reshape(y_size)
    print(backward)
    return backward
# NOTE: this rebinds the name `beta` from the function to its result, so the
# function itself cannot be called again after this line.
beta=beta()
tensor([[253.9492, 0.0000],
[ 60.7485, 53.8707],
[ 6.7072, 8.0634],
[ 1.0000, 1.0000],
[ 1.0000, 0.0000]], dtype=torch.float64)
def Z_alpha(alpha):
    """Return Z as the sum of the last forward vector.

    The last row is taken with ``alpha[-1]``, so the result depends only on
    the table passed in and not on a global sequence length.
    """
    return torch.sum(alpha[-1])
# Z from the forward pass.
print(Z_alpha(alpha))
tensor(253.9492, dtype=torch.float64)
def Z_beta(beta):
    """Return Z as the sum of the first backward vector."""
    first_row = beta[0]
    return first_row.sum()
# Z from the backward pass. The original passed the misspelled name `betta`,
# which is not defined anywhere and raises NameError.
print(Z_beta(beta))
tensor(253.9492, dtype=torch.float64)
5.使用前向后向的概率计算
推导:
def p_y_x_condition_alpha_beta(alpha,beta):
    """Return the single-position marginals p(y_i | x).

    Element-wise product of the forward and backward tables divided by Z;
    the result has the same shape as ``alpha``.
    """
    partition = Z_alpha(alpha)
    marginals = alpha * beta / partition
    return marginals
y=[0,1,1]  # not read by the call below
# Notebook cell: the marginal table is shown as the cell output.
p_y_x_condition_alpha_beta(alpha,beta)
tensor([[1.0000, 0.0000],
[0.6503, 0.3497],
[0.5269, 0.4731],
[0.5298, 0.4702],
[1.0000, 0.0000]], dtype=torch.float64)
def p_y12_x_condition_alpha_beta(alpha,beta):
    """Return the pairwise marginals p(y_{i-1}, y_i | x) for every position.

    For each position the matching matrix of the global ``M`` is scaled by
    the forward vector of the previous label (as a column) and the backward
    vector of the current label (as a row); the result is divided by Z.
    """
    joint = M.clone().detach()  # copy, so the global M is left untouched
    for pos in range(sequence_len + 1):
        incoming = alpha[pos].reshape(y_size, 1)
        joint[pos] = incoming * joint[pos] * beta[pos + 1]
    return joint / Z_alpha(alpha)
# Notebook cell: the pairwise marginal table is shown as the cell output.
p_y12_x_condition_alpha_beta(alpha,beta)
tensor([[[0.6503, 0.3497],
[0.0000, 0.0000]],
[[0.2634, 0.3868],
[0.2634, 0.0863]],
[[0.1748, 0.3520],
[0.3550, 0.1182]],
[[0.5298, 0.0000],
[0.4702, 0.0000]]], dtype=torch.float64)
6.期望计算
def E_fk_py_x(k,alpha,beta):
    """Return E_{p(y|x)}(f_k), the expectation of feature ``k``.

    Sums feature ``f[k]`` weighted by the pairwise marginals over every
    position and label pair. Note that the parameter ``k`` hides the global
    ``k`` inside this function.
    """
    pairwise = p_y12_x_condition_alpha_beta(alpha, beta)
    weighted = f[k] * pairwise
    return torch.sum(weighted)
# Notebook cell: expectation of feature 1, shown as the cell output.
E_fk_py_x(1,alpha,beta)
tensor(0.1317, dtype=torch.float64)
7.参数估计(学习)
7.1 梯度上升
def delta_log_L(self,alpha,beta,y):
    """Return the per-feature update direction used by ``train``.

    The first term picks entries of ``self.f`` along the label sequence ``y``
    and sums them; the second term sums ``self.f`` weighted by the pairwise
    marginals over axes 1-3. The difference is returned.

    NOTE(review): presumably this is observed feature counts minus expected
    feature counts (the log-likelihood gradient) - confirm against the class.
    """
    # print(self.f[:,3,[0,0,1,1],[0,1,1,0]])
    #y=[0,1,1]
    # NOTE(review): axis 1 is indexed with the single value len(y), and the
    # label appended after y is the literal 9, while the commented example
    # above appends 0. The layout of self.f is not visible here - confirm.
    delta=torch.sum(self.f[:,len(y),[0]+y,y+[9]],axis=(1))-torch.sum(self.f* self.p_y12_x_condition_alpha_beta(alpha, beta),axis=(1,2,3))
    return delta
def predict(self,x):
    """Return ``self.Viterbi_M()`` after preparing the model for ``x``.

    The statements are order dependent: the sequence length is stored first,
    then ``get_ts`` and ``f2M`` are called before decoding.
    NOTE(review): presumably get_ts builds the features of x that f2M turns
    into the matrix form; both are defined outside this section - confirm.
    """
    self.sequence_len = len(x)
    self.get_ts(x)
    self.M = self.f2M()
    return self.Viterbi_M()
def train(self, traindata, batch_size=100, learning_rate=0.0001):
    """Fit ``self.w`` by mini-batch gradient ascent.

    Args:
        traindata: pair ``(xs, ys)``; ``xs[n]`` is an observation sequence
            and ``ys[n]`` its label sequence.
        batch_size: number of samples accumulated per weight update.
        learning_rate: step size applied to the accumulated update.

    Raises:
        ValueError: if ``batch_size`` is not positive.

    Only complete batches are processed, as before: a trailing remainder of
    fewer than ``batch_size`` samples is skipped.

    NOTE(review): the source lost its indentation. The prints and the weight
    update are placed once per batch, after the inner loop, which is the most
    plausible reading - confirm against the original file.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    num_batch = len(traindata[0]) // batch_size
    for e in range(num_batch):
        delta = 0  # update accumulated over the current batch
        for i in range(batch_size):
            x = traindata[0][e*batch_size+i]
            y = traindata[1][e*batch_size+i]
            # Rebuild the per-sequence state before the forward/backward pass.
            self.sequence_len = len(x)
            self.get_ts(x)
            self.M = self.f2M()
            alpha = self.alpha()
            beta = self.beta()
            delta += self.delta_log_L(alpha, beta, y)
        # Progress output: the batch update, then the decoded path and the
        # gold labels of the last sample in the batch.
        print(delta)
        print(self.Viterbi_M())
        print(y)
        self.w = self.w + learning_rate * delta
-
◼实际上, 梯度上升收敛非常慢
-
⚫ 替代选择:
- ◆ 共轭梯度方法
- ◆ 内存受限拟牛顿法
-
目前的实现速度贼慢……以后再改
参考文献
- 国科大prml课程
- 国科大nlp课程
- 条件随机场CRF(一)从随机场到线性链条件随机场
- 统计学习方法(李航)
- 白板推导CRF
- 一个crf实现(用了他的特征函数)