Factorization Machines: Introduction and Code Implementation


Introduction

FM (Factorization Machines) combines the advantages of SVMs and factorization models. Even on highly sparse data it can still estimate interactions between features, and the model can be computed and optimized in linear time.

Advantages

  1. It allows reasonable parameter estimation even on very sparse data.

  2. The time complexity of the FM model is linear.

  3. FM is a general predictor that works with any real-valued feature vector.

Example feature vectors

[Figure: example feature vectors for a movie-rating task, with one-hot user and movie indicators, normalized ratings of other movies, time, and the last movie rated, as in the toy data below]

How the Algorithm Works

  1. Model equation:
    [equation image; the degree-2 model is reproduced below]
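For reference, the degree-2 FM model equation from the original FM paper (Rendle, 2010) is:

$$\hat{y}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle\, x_i x_j$$

where $w_0$ is the global bias, $w_i$ the weight of feature $i$, and $\langle \mathbf{v}_i, \mathbf{v}_j \rangle$ the dot product of the two features' $k$-dimensional latent factor vectors.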
  2. Expressiveness:

For any positive definite matrix $W$ there exists a matrix $V$ such that $W = V \cdot V^t$; that is, any $W$ can be recovered from $W = V \cdot V^t$ as long as the number of columns $k$ of $V$ is chosen large enough. When the data is very sparse there is not enough information to estimate a full $W$ directly, so instead we choose a small $k$ and express the interaction matrix through $W = V \cdot V^t$.

  3. Parameter Estimation Under Sparsity:

Because of the factorization, FM breaks the independence between interaction parameters: observations of one interaction also help estimate the parameters of related interactions. For example, in the movie data above, user A's factor vector is learned from all of A's ratings, so the interaction between A and a movie A has never rated can still be estimated.

  4. Computation:
    [equation image: straightforward evaluation of the pairwise interaction term]
    Computed directly, the formula above has time complexity $O(kn^2)$.

However, the pairwise interaction term can be reformulated as:
[equation image: the $O(kn)$ reformulation, reproduced below]
The first step of the derivation can be seen from the following figure:
[figure image]
The complexity then drops to $O(kn)$.
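For reference, the reformulation used later in the code is:

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle\, x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n} v_{i,f}\, x_i\right)^2 - \sum_{i=1}^{n} v_{i,f}^2\, x_i^2\right]$$

The left-hand side sums over all pairs of features, while each inner sum on the right is linear in $n$, which is what brings the cost down to $O(kn)$.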

FM as Predictor

  • Regression

  • Binary classification

  • Ranking

All of these can be trained with L2 regularization to prevent overfitting.
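For the regression case used in the code below, the regularized least-squares objective looks like this (a sketch of what the TensorFlow loss implements, with separate penalties $\lambda_w$ and $\lambda_v$ for the linear weights and the factors):

$$L = \frac{1}{N}\sum_{m=1}^{N}\left(\hat{y}(\mathbf{x}^{(m)}) - y^{(m)}\right)^2 + \lambda_w \lVert \mathbf{w} \rVert^2 + \lambda_v \lVert V \rVert^2$$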

Learning FM

[equation image: gradients of the FM model, reproduced below]
The parameters are updated with these gradients (e.g. by stochastic gradient descent).
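For reference, the gradients of the model output with respect to each parameter, as given in the FM paper, are:

$$\frac{\partial \hat{y}}{\partial \theta} =
\begin{cases}
1, & \text{if } \theta = w_0 \\
x_i, & \text{if } \theta = w_i \\
x_i \sum_{j=1}^{n} v_{j,f}\, x_j - v_{i,f}\, x_i^2, & \text{if } \theta = v_{i,f}
\end{cases}$$

The sum $\sum_j v_{j,f} x_j$ does not depend on $i$, so it can be precomputed once per sample. The TensorFlow code below relies on automatic differentiation, but a minimal NumPy sketch of one manual SGD step on the squared error (illustrative only; the shapes and the helper name sgd_step are my own, not from the original post) would look like:

import numpy as np

def sgd_step(x, y, w0, w, V, lr=0.01):
    """One manual SGD step on the squared error for a single sample.

    x  : (p,) dense float feature vector
    y  : scalar target
    w0 : scalar bias, w : (p,) linear weights, V : (p, k) factor matrix
    (note: the TensorFlow code below stores the factors transposed, as (k, p))
    """
    xv = x @ V                                   # (k,): sum_i v_{i,f} x_i for each factor f
    y_hat = w0 + w @ x + 0.5 * np.sum(xv ** 2 - (x ** 2) @ (V ** 2))
    g = 2.0 * (y_hat - y)                        # derivative of (y_hat - y)^2 w.r.t. y_hat
    w0 = w0 - lr * g
    w = w - lr * g * x
    V = V - lr * g * (np.outer(x, xv) - (x ** 2)[:, None] * V)
    return w0, w, V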

Code Implementation

Simple data version:


import numpy as np
import tensorflow as tf

x_data = np.matrix([
    #   Users  |    Movies   | Movie Ratings  | Time | Last Movies Rated
    #  A  B  C | TI NH SW ST | TI  NH  SW  ST |      | TI NH SW ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
])

# ratings
y_data = np.array([5, 3, 1, 4, 5, 1, 5])

# Let's add an axis to make tensorflow happy.
y_data.shape += (1, )

n, p = x_data.shape

# number of latent factors
k = 5

# design matrix
X = tf.placeholder('float32', [n, p])
# target vector
y = tf.placeholder('float32', [n, 1])

# bias and weights
w0 = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.zeros([p]))

# interaction factors, randomly initialized
V = tf.Variable(tf.random_normal([k, p], stddev=0.01))

# estimate of y, initialized to 0.
y_hat = tf.Variable(tf.zeros([n, 1]))

linear_terms = tf.add(w0,
                      tf.reduce_sum(
                          tf.multiply(W, X), 1, keepdims=True))

# O(kn) pairwise-interaction term: 0.5 * sum_f [(X V^T)^2 - X^2 (V^2)^T]
interactions = (tf.multiply(0.5,
                            tf.reduce_sum(
                                tf.subtract(
                                    tf.pow(tf.matmul(X, tf.transpose(V)), 2),
                                    tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))),
                                1, keepdims=True)))

y_hat = tf.add(linear_terms, interactions)

# L2 regularized sum of squares loss function over W and V
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = (tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(W, 2)),
        tf.multiply(lambda_v, tf.pow(V, 2)))))

error = tf.reduce_mean(tf.square(tf.subtract(y, y_hat)))
loss = tf.add(error, l2_norm)

eta = tf.constant(0.1)
optimizer = tf.train.AdagradOptimizer(eta).minimize(loss)

# that's a lot of iterations
N_EPOCHS = 1000

# Launch the graph.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(N_EPOCHS):
        # indices = np.arange(n)
        # np.random.shuffle(indices)
        # x_data, y_data = x_data[indices], y_data[indices]
        sess.run(optimizer, feed_dict={X: x_data, y: y_data})

    print('MSE: ', sess.run(error, feed_dict={X: x_data, y: y_data}))
    print('Loss (regularized error):', sess.run(loss, feed_dict={X: x_data, y: y_data}))
    print('Predictions:', sess.run(y_hat, feed_dict={X: x_data, y: y_data}))
    print('Learnt weights:', sess.run(W, feed_dict={X: x_data, y: y_data}))
    print('Learnt factors:', sess.run(V, feed_dict={X: x_data, y: y_data}))
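Note how the interactions node in the graph above implements the $O(kn)$ reformulation derived earlier: tf.pow(tf.matmul(X, tf.transpose(V)), 2) computes $(\sum_i v_{i,f} x_i)^2$ per sample and per factor, and tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2))) computes $\sum_i v_{i,f}^2 x_i^2$, so no explicit loop over feature pairs is needed.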

Complex data version: using data from the MovieLens 100K dataset
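(ua.base and ua.test below are the predefined train/test split files that ship with the MovieLens 100K archive; place them next to the script or adjust the paths.)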


from scipy.sparse import csr
import pandas as pd
import numpy as np
import tensorflow as tf

def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix  -- index generator (default None)
    p   -- dimension of feature space (number of columns in the sparse matrix) (default None)
    n   -- number of samples (rows); g -- number of feature groups per sample
    """
    if ix is None:
        ix = dict()

    nz = n * g
    col_ix = np.empty(nz, dtype=int)

    i = 0
    numofUsers = 0
    flag = True
    for k, lis in dic.items():
        for t in range(len(lis)):
            if k == 'users':
                ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), lis[t] - 1)
            elif k == 'items':
                if flag == True:
                    # item columns start right after the user columns
                    numofUsers = len(ix)
                    flag = False
                ix[str(lis[t]) + str(k)] = lis[t] - 1 + numofUsers
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)

    # where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix

def batcher(X_, y_=None, batch_size=-1):
    n_samples = X_.shape[0]
    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        if y_ is not None:
            ret_y = y_[i:upper_bound]
            yield (ret_x, ret_y)

cols = ['user', 'item', 'rating', 'timestamp']
train = pd.read_csv('ua.base', delimiter='\t', names=cols)
test = pd.read_csv('ua.test', delimiter='\t', names=cols)

x_train, ix = vectorize_dic({'users': train['user'].values,
                             'items': train['item'].values}, n=len(train.index), g=2)
x_test, ix = vectorize_dic({'users': test['user'].values,
                            'items': test['item'].values}, ix, x_train.shape[1], n=len(test.index), g=2)

y_train = train['rating'].values
y_test = test['rating'].values

x_train = x_train.todense()
x_test = x_test.todense()

n, p = x_train.shape

# number of latent factors
k = 10

x = tf.placeholder('float', [None, p])
y = tf.placeholder('float', [None, 1])

w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))
v = tf.Variable(tf.random_normal([k, p], mean=0, stddev=0.01))

# y_hat = tf.Variable(tf.zeros([n, 1]))

linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(w, x), 1, keepdims=True))  # n * 1

# O(kn) pairwise-interaction term
pair_interactions = 0.5 * tf.reduce_sum(
    tf.subtract(
        tf.pow(
            tf.matmul(x, tf.transpose(v)), 2),
        tf.matmul(tf.pow(x, 2), tf.transpose(tf.pow(v, 2)))
    ), axis=1, keepdims=True)

y_hat = tf.add(linear_terms, pair_interactions)

# L2 regularization over w and v
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')

l2_norm = tf.reduce_sum(
    tf.add(
        tf.multiply(lambda_w, tf.pow(w, 2)),
        tf.multiply(lambda_v, tf.pow(v, 2))
    )
)

error = tf.reduce_mean(tf.square(y - y_hat))
loss = tf.add(error, l2_norm)

train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

epochs = 10
batch_size = 1000

# Launch the graph
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        perm = np.random.permutation(x_train.shape[0])
        # iterate over batches
        for bX, bY in batcher(x_train[perm], y_train[perm], batch_size):
            _, t = sess.run([train_op, loss], feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)

    print('MSE: ', sess.run(error, feed_dict={x: x_test.reshape(-1, p), y: y_test.reshape(-1, 1)}))
    print('Predictions:', sess.run(y_hat, feed_dict={x: x_test.reshape(-1, p), y: y_test.reshape(-1, 1)}))
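Since the feature matrix here only contains one-hot user and item indicators (two non-zero entries per row), this FM reduces to a biased matrix-factorization model of the ratings. Adding further columns, such as time or the last movie rated as in the toy example, is a matter of extending vectorize_dic to emit the extra feature groups.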
