Rating-based top-N item recommendation system

import io  # needed because of weird encoding of u.item file
import os
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import dump

def read_item_names(item_file_path, split_flag='\t'):
    """
    Read an item file (the u.item layout from the MovieLens 100k dataset)
    and return two mappings: raw id -> item name and item name -> raw id.
    """

    rid_to_name = {}
    name_to_rid = {}
    with io.open(item_file_path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.split(split_flag)
            rid_to_name[line[0]] = line[1].strip()
            name_to_rid[line[1].strip()] = line[0]

    return rid_to_name, name_to_rid

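For example, with the uitems.txt file generated at the end of this post (each line is an id, a tab, and the name uitems<id>), the two mappings are simple inverses of each other:

# Hypothetical quick check of the two mappings (assumes uitems.txt exists).
rid_to_name, name_to_rid = read_item_names('uitems.txt')
print(rid_to_name['685'])        # -> uitems685
print(name_to_rid['uitems685'])  # -> 685
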
save_path = os.path.expanduser(r'~/dump_file')  # where the trained model is dumped
def train_data(user_item_score_path, split_flag='\t', user_based=False,
               rating_scale=(1, 10)):
    # Path to the dataset file. The generated data below uses a 1-10 rating
    # scale, so it is passed to the Reader explicitly (the default is 1-5).
    file_path = os.path.expanduser(user_item_score_path)
    reader = Reader(line_format='user item rating timestamp', sep=split_flag,
                    rating_scale=rating_scale)
    data = Dataset.load_from_file(file_path, reader=reader)


    # First, train the algorithm to compute the similarities between items.
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': user_based}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    
    # Dump the trained algorithm so it can be reloaded later.
    dump.dump(save_path, algo=algo)

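Before committing to the full trainset, it can be worth a quick accuracy check. Below is a minimal sketch using surprise's built-in cross-validation; the file path and reader settings mirror train_data and are assumptions:

import os
from surprise import Dataset, KNNBaseline, Reader
from surprise.model_selection import cross_validate

reader = Reader(line_format='user item rating timestamp', sep='\t',
                rating_scale=(1, 10))
data = Dataset.load_from_file(os.path.expanduser('uuu.txt'), reader=reader)  # path is an assumption
algo = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})
# 5-fold cross-validation, printing RMSE and MAE for each fold.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
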
def get_neighbors(item_name, item_file_path, kk=10):
    _, algo = dump.load(save_path)  # reload the trained model
    # Read the mappings raw id <-> item name.
    rid_to_name, name_to_rid = read_item_names(item_file_path)

    # Retrieve the inner id of the target item.
    item_raw_id = name_to_rid[item_name.strip()]
    item_inner_id = algo.trainset.to_inner_iid(item_raw_id)

    # Retrieve the inner ids of the item's k nearest neighbors.
    neighbor_inner_ids = algo.get_neighbors(item_inner_id, k=kk)

    # Convert the neighbors' inner ids back into item names.
    neighbor_raw_ids = (algo.trainset.to_raw_iid(inner_id)
                        for inner_id in neighbor_inner_ids)
    return (rid_to_name[rid] for rid in neighbor_raw_ids)

u_i_path = r'C:\Users\FELIX\Desktop\surprise库源码分析\uuu.txt'
train_data(u_i_path)
i_path = r'C:\Users\FELIX\Desktop\surprise库源码分析\uitems.txt'
nei_items = get_neighbors('uitems685', i_path, kk=10)
for nei in nei_items:
    print(nei)
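
The query above is item-to-item: the k items most similar to uitems685. To produce the per-user, rating-based top-N list the title promises, you can predict a rating for every item the user has not rated and keep the N best estimates. A minimal sketch built on the model dumped by train_data (top_n_for_user and its parameters are hypothetical names, not part of the original script):

def top_n_for_user(user_raw_id, item_file_path, n=10):
    # Reload the trained model and the id <-> name mappings.
    _, algo = dump.load(save_path)
    rid_to_name, _ = read_item_names(item_file_path)
    trainset = algo.trainset

    inner_uid = trainset.to_inner_uid(user_raw_id)
    rated = {iid for (iid, _) in trainset.ur[inner_uid]}  # items the user already rated

    # Predict a rating for every unseen item, then keep the n highest estimates.
    scored = []
    for inner_iid in trainset.all_items():
        if inner_iid in rated:
            continue
        raw_iid = trainset.to_raw_iid(inner_iid)
        scored.append((algo.predict(user_raw_id, raw_iid).est, raw_iid))
    scored.sort(reverse=True)
    return [(rid_to_name[rid], est) for est, rid in scored[:n]]

for name, est in top_n_for_user('5', i_path, n=10):  # user id '5' is an assumption
    print(name, est)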

If you don't have data, you can randomly generate test data:

# Generate synthetic data: 1000 users, 5000 items; each user rates a random
# subset of the items on a 1-10 scale.
import random

with open('uu.txt', 'a', encoding='utf8') as f:
    for n in range(4):          # four passes, so most users rate in several rounds
        for i in range(1000):   # user ids 0-999
            t = int(random.random() * 100)  # how many ratings this user adds this pass
            for j in range(t):
                item = int(random.random() * 5000)  # item ids 0-4999
                goal = random.randint(1, 10)        # rating in 1-10, matching the comment above
                # The trailing tab leaves an empty timestamp field, which the
                # Reader's 'user item rating timestamp' line_format expects.
                f.write(str(i) + '\t' + str(item) + '\t' + str(goal) + '\t' + '\n')

# Randomly shuffle the rating lines.
with open('uu.txt', 'r', encoding='utf8') as f:
    data = f.readlines()
random.shuffle(data)  # shuffles in place; random.shuffle always returns None
with open('uuu.txt', 'w', encoding='utf8') as f2:  # 'w' so reruns don't duplicate lines
    f2.writelines(data)

# Generate the item file: one "raw id <tab> item name" line per item.
with open('uitems.txt', 'w', encoding='utf8') as f:
    for i in range(5000):
        s = str(i) + '\t' + 'uitems{}'.format(i) + '\n'
        f.write(s)
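
If you'd rather skip the intermediate text files, surprise can also read ratings straight from a pandas DataFrame. A minimal sketch, assuming pandas is available (the column names are arbitrary; load_from_df only cares that the columns come in user, item, rating order):

import pandas as pd
from surprise import Dataset, Reader

# Hypothetical in-memory equivalent of a few lines of uuu.txt.
ratings = pd.DataFrame({
    'user':   ['0', '0', '1'],
    'item':   ['12', '997', '12'],
    'rating': [7, 3, 9],
})
reader = Reader(rating_scale=(1, 10))  # no line_format needed for DataFrames
data = Dataset.load_from_df(ratings[['user', 'item', 'rating']], reader)
trainset = data.build_full_trainset()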

Reposted from www.cnblogs.com/felixwang2/p/9415578.html