问题
有若干物品和若干用户,每个用户对其中的部分物品进行了评分,如何使用该评分数据为用户推荐未评分但最合适的物品
数据如下:
每行为一个用户,每列为一个物品
# User-item rating matrix: each row is a user, each column is an item,
# and 0 marks an item that the user has not rated yet.
data_set = mat([[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
[0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
[0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
[3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
[5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
[0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
[4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
[0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
[0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]])
估分算法
对于每个该用户评分的物品:
1、找到同时评价过该物品和待估分物品的用户
2、通过相似度算法计算该物品和待估分物品的相似度
3、将相似度乘以用户对该物品的评分作为估分
将所有相似度相加,将所有估分相加
总估分 / 总相似度 即为最终估分(若总相似度为0,则估分为0)
相似度计算方法实现如下:
def cos_sim(in_a, in_b):
    """Return the cosine similarity of two row vectors, rescaled to [0, 1].

    The raw cosine lies in [-1, 1]; it is mapped through ``0.5 + 0.5 * cos``
    so that opposite directions give 0 and identical directions give 1.

    Args:
        in_a: Row vector. Callers in this file pass 1 x n ``numpy.matrix``
            objects, so ``in_a * in_b.T`` is a dot product.
        in_b: Row vector with the same shape as ``in_a``.

    Returns:
        The rescaled similarity, or 0.0 when either vector has zero length
        (the cosine is undefined there, and 0 makes the pair contribute
        nothing to a similarity-weighted average instead of turning it
        into NaN).
    """
    norm_a = linalg.norm(in_a)
    norm_b = linalg.norm(in_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # .item() extracts the single element explicitly; calling float() directly
    # on a 1 x 1 matrix is deprecated in recent NumPy releases.
    dot = float((in_a * in_b.T).item())
    return 0.5 + 0.5 * dot / norm_a / norm_b
更多相似度计算方法:链接
估分算法实现如下:
def estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item``.

    The estimate is a similarity-weighted average of the ratings the user
    has already given. The similarity between ``item`` and each rated item
    is computed by ``cos_sim`` over the ratings of the users who rated both.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The weighted average rating, or 0 when the accumulated similarity
        is 0 (for example when the user has rated nothing, or no rated item
        shares a rater with ``item``).
    """
    weighted_sum = 0
    similarity_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating == 0:
            continue
        # Row indices of the users who rated both ``item`` and ``other``.
        both_rated = logical_and(data_set[:, item] > 0, data_set[:, other] > 0)
        raters = nonzero(both_rated)[0]
        if len(raters) == 0:
            # No shared raters: the pair contributes nothing to either sum.
            continue
        similarity = cos_sim(data_set[raters, item].T, data_set[raters, other].T)
        similarity_sum += similarity
        weighted_sum += similarity * rating
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum
将估分最高的前n个物品推荐给用户,实现如下:
def recommend(data_set, user, item_number):
    """Return the best-scoring unrated items for ``user``.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs sorted by score in
        descending order, or ``None`` when the user has no unrated items.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if len(candidates) == 0:
        return None
    ranked = [[item, estimate_score(data_set, user, item)] for item in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]
使用SVD提高效果
首先对数据集进行SVD分解,找出使累计能量(奇异值sigma的平方和)达到矩阵总能量90%所需的最少sigma数量。
将数据集降维重构:数据集转置 * u的前sigma_amount列 * 前sigma_amount个sigma构成的对角矩阵的逆,结果如下:
[[-0.45137416 0.03084799 -0.00290108]
[-0.36239706 0.02584428 -0.00189127]
[-0.46879252 0.03296133 -0.00281253]
[-0.01007685 -0.34024331 -0.22728592]
[-0.01567036 -0.38750193 0.61197998]
[-0.01664563 -0.52000097 -0.3608907 ]
[-0.00474684 -0.18887149 -0.00924222]
[-0.46712774 0.00389831 0.03349951]
[-0.47223188 0.02853952 -0.00504059]
[-0.01591788 -0.39205093 0.55707516]
[-0.0552444 -0.52034959 -0.36330956]]
每行代表一个物品
对于用户评价的每个物品:
1、计算新数据集中该物品与待评分物品的相似度
2、分数为相似度 * 用户评分
评分为:总评分 / 总相似度
代码实现如下:
def svd_estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item`` in SVD space.

    The rating matrix is decomposed with SVD, the smallest number of leading
    singular values whose squared sum reaches 90% of the total is kept, and
    every item is projected into that reduced space. The estimate is then a
    similarity-weighted average of the user's existing ratings, with
    ``cos_sim`` applied to the projected item rows.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The weighted average rating, or 0 when the accumulated similarity
        is 0 (for example when the user has rated nothing).
    """
    u, sigmas, _ = linalg.svd(data_set)
    energy = sigmas ** 2
    energy_target = sum(energy) * 0.9
    rank = 1
    while sum(energy[:rank]) < energy_target:
        rank += 1
    # One row per item: data_set.T * U_k * inverse(diag(sigma_1..sigma_k)).
    sigma_inverse = mat(eye(rank) * sigmas[:rank]).I
    item_features = data_set.T * u[:, :rank] * sigma_inverse
    print(item_features)

    weighted_sum = 0
    similarity_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating == 0:
            continue
        similarity = cos_sim(item_features[item, :], item_features[other, :])
        similarity_sum += similarity
        weighted_sum += similarity * rating
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum
将估分最高的前n个物品推荐给用户,实现如下:
def svd_recommend(data_set, user, item_number):
    """Return the best-scoring unrated items for ``user`` using SVD scores.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs sorted by score in
        descending order, or ``None`` when the user has no unrated items.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if len(candidates) == 0:
        return None
    ranked = [[item, svd_estimate_score(data_set, user, item)] for item in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]
示例
from numpy import mat, shape, nonzero, logical_and, linalg, eye
def cos_sim(in_a, in_b):
    """Return the cosine similarity of two row vectors, rescaled to [0, 1].

    The raw cosine lies in [-1, 1]; it is mapped through ``0.5 + 0.5 * cos``
    so that opposite directions give 0 and identical directions give 1.

    Args:
        in_a: Row vector. Callers in this file pass 1 x n ``numpy.matrix``
            objects, so ``in_a * in_b.T`` is a dot product.
        in_b: Row vector with the same shape as ``in_a``.

    Returns:
        The rescaled similarity, or 0.0 when either vector has zero length
        (the cosine is undefined there, and 0 makes the pair contribute
        nothing to a similarity-weighted average instead of turning it
        into NaN).
    """
    norm_a = linalg.norm(in_a)
    norm_b = linalg.norm(in_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # .item() extracts the single element explicitly; calling float() directly
    # on a 1 x 1 matrix is deprecated in recent NumPy releases.
    dot = float((in_a * in_b.T).item())
    return 0.5 + 0.5 * dot / norm_a / norm_b
def estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item``.

    The estimate is a similarity-weighted average of the ratings the user
    has already given. The similarity between ``item`` and each rated item
    is computed by ``cos_sim`` over the ratings of the users who rated both.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The weighted average rating, or 0 when the accumulated similarity
        is 0 (for example when the user has rated nothing, or no rated item
        shares a rater with ``item``).
    """
    weighted_sum = 0
    similarity_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating == 0:
            continue
        # Row indices of the users who rated both ``item`` and ``other``.
        both_rated = logical_and(data_set[:, item] > 0, data_set[:, other] > 0)
        raters = nonzero(both_rated)[0]
        if len(raters) == 0:
            # No shared raters: the pair contributes nothing to either sum.
            continue
        similarity = cos_sim(data_set[raters, item].T, data_set[raters, other].T)
        similarity_sum += similarity
        weighted_sum += similarity * rating
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum
def svd_estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item`` in SVD space.

    The rating matrix is decomposed with SVD, the smallest number of leading
    singular values whose squared sum reaches 90% of the total is kept, and
    every item is projected into that reduced space. The estimate is then a
    similarity-weighted average of the user's existing ratings, with
    ``cos_sim`` applied to the projected item rows.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The weighted average rating, or 0 when the accumulated similarity
        is 0 (for example when the user has rated nothing).
    """
    u, sigmas, _ = linalg.svd(data_set)
    energy = sigmas ** 2
    energy_target = sum(energy) * 0.9
    rank = 1
    while sum(energy[:rank]) < energy_target:
        rank += 1
    # One row per item: data_set.T * U_k * inverse(diag(sigma_1..sigma_k)).
    sigma_inverse = mat(eye(rank) * sigmas[:rank]).I
    item_features = data_set.T * u[:, :rank] * sigma_inverse

    weighted_sum = 0
    similarity_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating == 0:
            continue
        similarity = cos_sim(item_features[item, :], item_features[other, :])
        similarity_sum += similarity
        weighted_sum += similarity * rating
    if similarity_sum == 0:
        return 0
    return weighted_sum / similarity_sum
def recommend(data_set, user, item_number):
    """Return the best-scoring unrated items for ``user``.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs sorted by score in
        descending order, or ``None`` when the user has no unrated items.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if len(candidates) == 0:
        return None
    ranked = [[item, estimate_score(data_set, user, item)] for item in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]
def svd_recommend(data_set, user, item_number):
    """Return the best-scoring unrated items for ``user`` using SVD scores.

    Args:
        data_set: Rating matrix (rows are users, columns are items, 0 means
            "not rated").
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs sorted by score in
        descending order, or ``None`` when the user has no unrated items.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if len(candidates) == 0:
        return None
    ranked = [[item, svd_estimate_score(data_set, user, item)] for item in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]
def main():
    """Print the top-3 recommendations for user 1 from both estimators."""
    # Rows are users, columns are items; 0 means the user has not rated it.
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    data_set = mat(ratings)
    user, top_n = 1, 3
    print(recommend(data_set, user, top_n))
    print(svd_recommend(data_set, user, top_n))


if __name__ == '__main__':
    main()
输出:
[[6, 3.3333333333333335], [9, 3.3333333333333335], [0, 3.0]]
[[6, 3.3329499901459845], [9, 3.3315447178728395], [4, 3.3314474877128624]]