python实现协同过滤算法

问题

有若干物品和若干用户,每个用户对其中的部分物品进行了评分,如何使用该评分数据为用户推荐未评分但最合适的物品
数据如下:
每行为一个用户,每列为一个物品

    # Sample ratings: one row per user, one column per item; 0 means "not rated".
    data_set = mat([[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
                    [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
                    [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
                    [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
                    [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
                    [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
                    [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
                    [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
                    [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
                    [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
                    [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]])

估分算法

对于每个该用户评分的物品:
1、找到同时评价过该物品和待估分物品的用户
2、通过相似度算法计算该物品和待估分物品的相似度
3、将相似度乘以用户对该物品的评分作为估分
最后将所有相似度相加得到总相似度,将所有估分相加得到总估分,
总估分 / 总相似度即为最终估分

相似度计算方法实现如下:

def cos_sim(in_a, in_b):
    """Return the cosine similarity of two row vectors, rescaled to [0, 1].

    Args:
        in_a: 1 x n ``numpy.matrix`` row vector.
        in_b: 1 x n ``numpy.matrix`` row vector of the same length.

    Returns:
        ``0.5 + 0.5 * cos(in_a, in_b)``: 1.0 for the same direction, 0.5 for
        orthogonal vectors and 0.0 for opposite directions. When either
        vector has zero length the cosine is undefined, so the neutral value
        0.5 is returned instead of NaN.
    """
    norm_a = linalg.norm(in_a)
    norm_b = linalg.norm(in_b)
    if norm_a == 0 or norm_b == 0:
        # Avoid 0 / 0: a NaN here would propagate into every estimated score.
        return 0.5
    # The matrix product is a 1 x 1 matrix; extract its single element
    # explicitly because implicit float() conversion of arrays with
    # ndim > 0 is deprecated in recent NumPy releases.
    dot_product = (in_a * in_b.T).item()
    return 0.5 + 0.5 * dot_product / norm_a / norm_b

更多相似度计算方法:链接
估分算法实现如下:

def estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item``.

    For every item the user has rated, the similarity between that item and
    ``item`` is computed over the users who rated both, and the user's
    ratings are averaged using those similarities as weights.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The similarity-weighted average rating, or 0 when the total
        similarity is 0.
    """
    similarity_sum = 0
    weighted_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating != 0:
            # Rows (users) that rated both the target item and this one.
            both_rated = logical_and(data_set[:, item] > 0, data_set[:, other] > 0)
            raters = nonzero(both_rated)[0]
            if len(raters) > 0:
                similarity = cos_sim(data_set[raters, item].T, data_set[raters, other].T)
            else:
                similarity = 0
            similarity_sum += similarity
            weighted_sum += similarity * rating
    return weighted_sum / similarity_sum if similarity_sum != 0 else 0

将估分最高的前n个物品推荐给用户,实现如下:

def recommend(data_set, user, item_number):
    """Return the ``item_number`` best unrated items for ``user``.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs ordered by
        descending score, or ``None`` when the user has rated every item.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if not len(candidates):
        return None
    ranked = [[candidate, estimate_score(data_set, user, candidate)]
              for candidate in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]

使用SVD提高效果

首先将数据集进行SVD分解,求出累计能量(sigma的平方和)达到矩阵总能量90%所需的最少sigma数量。
将数据集重构:数据集转置 * u的前sigma数量列 * 由前sigma数量个sigma构成的对角矩阵的逆,结果如下:

[[-0.45137416  0.03084799 -0.00290108]
 [-0.36239706  0.02584428 -0.00189127]
 [-0.46879252  0.03296133 -0.00281253]
 [-0.01007685 -0.34024331 -0.22728592]
 [-0.01567036 -0.38750193  0.61197998]
 [-0.01664563 -0.52000097 -0.3608907 ]
 [-0.00474684 -0.18887149 -0.00924222]
 [-0.46712774  0.00389831  0.03349951]
 [-0.47223188  0.02853952 -0.00504059]
 [-0.01591788 -0.39205093  0.55707516]
 [-0.0552444  -0.52034959 -0.36330956]]

每行代表一个物品

对于用户评价的每个物品:
1、计算新数据集中该物品与待评分物品的相似度
2、分数为相似度 * 用户评分
评分为:总评分 / 总相似度
代码实现如下:

def svd_estimate_score(data_set, user, item):
    """Estimate ``user``'s rating of ``item`` using SVD-reduced item vectors.

    The rating matrix is decomposed with SVD and just enough leading
    singular values are kept to hold 90% of the total energy (the sum of the
    squared singular values). Each item is then represented by its row of
    ``data_set.T * U_k * inv(Sigma_k)``, and the estimate is the
    similarity-weighted average of the ratings the user has already given.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The estimated rating, or 0 when no estimate can be made (the matrix
        is all zeros, or the total similarity is 0).
    """
    u, sigmas, _ = linalg.svd(data_set)
    energy = sigmas ** 2
    total_energy = sum(energy)
    if total_energy == 0:
        # All-zero ratings: every singular value is 0, so the reduced item
        # space cannot be built (dividing by the singular values would fail).
        return 0
    sigma_amount = 1
    while sum(energy[:sigma_amount]) < total_energy * 0.9:
        sigma_amount += 1
    # Multiplying by inv(diag(sigmas)) equals dividing column j by
    # sigmas[j]; this avoids building and inverting a diagonal matrix.
    new_data_set = data_set.T * u[:, :sigma_amount] / sigmas[:sigma_amount]
    total_similarity = 0
    score = 0
    for i in range(shape(data_set)[1]):
        user_rating = data_set[user, i]
        if user_rating == 0:
            continue
        similarity = cos_sim(new_data_set[item, :], new_data_set[i, :])
        total_similarity += similarity
        score += similarity * user_rating
    if total_similarity == 0:
        return 0
    return score / total_similarity

将估分最高的前n个物品推荐给用户,实现如下:

def svd_recommend(data_set, user, item_number):
    """Return the ``item_number`` best unrated items for ``user`` using SVD scores.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs ordered by
        descending score, or ``None`` when the user has rated every item.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if not len(candidates):
        return None
    ranked = [[candidate, svd_estimate_score(data_set, user, candidate)]
              for candidate in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]

示例

from numpy import mat, shape, nonzero, logical_and, linalg, eye


def cos_sim(in_a, in_b):
    """Return the cosine similarity of two row vectors, rescaled to [0, 1].

    Args:
        in_a: 1 x n ``numpy.matrix`` row vector.
        in_b: 1 x n ``numpy.matrix`` row vector of the same length.

    Returns:
        ``0.5 + 0.5 * cos(in_a, in_b)``: 1.0 for the same direction, 0.5 for
        orthogonal vectors and 0.0 for opposite directions. When either
        vector has zero length the cosine is undefined, so the neutral value
        0.5 is returned instead of NaN.
    """
    norm_a = linalg.norm(in_a)
    norm_b = linalg.norm(in_b)
    if norm_a == 0 or norm_b == 0:
        # Avoid 0 / 0: a NaN here would propagate into every estimated score.
        return 0.5
    # The matrix product is a 1 x 1 matrix; extract its single element
    # explicitly because implicit float() conversion of arrays with
    # ndim > 0 is deprecated in recent NumPy releases.
    dot_product = (in_a * in_b.T).item()
    return 0.5 + 0.5 * dot_product / norm_a / norm_b


def estimate_score(data_set, user, item):
    """Estimate the rating ``user`` would give to ``item``.

    For every item the user has rated, the similarity between that item and
    ``item`` is computed over the users who rated both, and the user's
    ratings are averaged using those similarities as weights.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The similarity-weighted average rating, or 0 when the total
        similarity is 0.
    """
    similarity_sum = 0
    weighted_sum = 0
    for other in range(shape(data_set)[1]):
        rating = data_set[user, other]
        if rating != 0:
            # Rows (users) that rated both the target item and this one.
            both_rated = logical_and(data_set[:, item] > 0, data_set[:, other] > 0)
            raters = nonzero(both_rated)[0]
            if len(raters) > 0:
                similarity = cos_sim(data_set[raters, item].T, data_set[raters, other].T)
            else:
                similarity = 0
            similarity_sum += similarity
            weighted_sum += similarity * rating
    return weighted_sum / similarity_sum if similarity_sum != 0 else 0


def svd_estimate_score(data_set, user, item):
    """Estimate ``user``'s rating of ``item`` using SVD-reduced item vectors.

    The rating matrix is decomposed with SVD and just enough leading
    singular values are kept to hold 90% of the total energy (the sum of the
    squared singular values). Each item is then represented by its row of
    ``data_set.T * U_k * inv(Sigma_k)``, and the estimate is the
    similarity-weighted average of the ratings the user has already given.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item: Column index of the item to estimate.

    Returns:
        The estimated rating, or 0 when no estimate can be made (the matrix
        is all zeros, or the total similarity is 0).
    """
    u, sigmas, _ = linalg.svd(data_set)
    energy = sigmas ** 2
    total_energy = sum(energy)
    if total_energy == 0:
        # All-zero ratings: every singular value is 0, so the reduced item
        # space cannot be built (dividing by the singular values would fail).
        return 0
    sigma_amount = 1
    while sum(energy[:sigma_amount]) < total_energy * 0.9:
        sigma_amount += 1
    # Multiplying by inv(diag(sigmas)) equals dividing column j by
    # sigmas[j]; this avoids building and inverting a diagonal matrix.
    new_data_set = data_set.T * u[:, :sigma_amount] / sigmas[:sigma_amount]
    total_similarity = 0
    score = 0
    for i in range(shape(data_set)[1]):
        user_rating = data_set[user, i]
        if user_rating == 0:
            continue
        similarity = cos_sim(new_data_set[item, :], new_data_set[i, :])
        total_similarity += similarity
        score += similarity * user_rating
    if total_similarity == 0:
        return 0
    return score / total_similarity


def recommend(data_set, user, item_number):
    """Return the ``item_number`` best unrated items for ``user``.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs ordered by
        descending score, or ``None`` when the user has rated every item.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if not len(candidates):
        return None
    ranked = [[candidate, estimate_score(data_set, user, candidate)]
              for candidate in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]


def svd_recommend(data_set, user, item_number):
    """Return the ``item_number`` best unrated items for ``user`` using SVD scores.

    Args:
        data_set: ``numpy.matrix`` of ratings, one row per user and one
            column per item; 0 means "not rated".
        user: Row index of the user.
        item_number: Maximum number of recommendations to return.

    Returns:
        A list of ``[item_index, estimated_score]`` pairs ordered by
        descending score, or ``None`` when the user has rated every item.
    """
    candidates = nonzero(data_set[user, :] == 0)[1]
    if not len(candidates):
        return None
    ranked = [[candidate, svd_estimate_score(data_set, user, candidate)]
              for candidate in candidates]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:item_number]


def main():
    """Print the top 3 recommendations for user 1 from both recommenders."""
    # One row per user, one column per item; 0 marks an unrated item.
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    data_set = mat(ratings)
    target_user, top_n = 1, 3
    # Plain item-based estimate first, then the SVD-based one.
    print(recommend(data_set, target_user, top_n))
    print(svd_recommend(data_set, target_user, top_n))


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

输出:

[[6, 3.3333333333333335], [9, 3.3333333333333335], [0, 3.0]]
[[6, 3.3329499901459845], [9, 3.3315447178728395], [4, 3.3314474877128624]]

猜你喜欢

转载自blog.csdn.net/weixin_43793472/article/details/88656830
今日推荐