Task 4
1. Computing the recommendation metrics
Three recommendation metrics are implemented below; their precise definitions are given in Task 1.
- Recall
\operatorname{Recall}=\frac{\sum_{u}|R(u) \cap T(u)|}{\sum_{u}|T(u)|}
- Precision
\operatorname{Precision}=\frac{\sum_{u}|R(u) \cap T(u)|}{\sum_{u}|R(u)|}
- Coverage
\text{Coverage}=\frac{\left|\bigcup_{u \in U} R(u)\right|}{|I|}
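As a quick sanity check, consider a made-up example with two users, R(u1) = {a, b}, T(u1) = {a, c, d}, R(u2) = {c}, T(u2) = {c}, and a catalogue I = {a, b, c, d, e}:
\operatorname{Recall}=\frac{1+1}{3+1}=0.5, \qquad \operatorname{Precision}=\frac{1+1}{2+1}\approx 0.67, \qquad \text{Coverage}=\frac{|\{a,b,c\}|}{5}=0.6
Note that the implementations below return the average of the per-user recall/precision rather than the pooled ratios in the formulas above, so the numbers can differ slightly.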
1.1 Recall
def recall(y_pred, y_true):
    """Compute recall.
    Args:
        y_pred (tensor): each row holds the indices (0-based) of the items recommended to one user
        y_true (tensor): the rating matrix of the test set
    Returns:
        float: mean of the per-user recall values
    """
    # Scatter 1s into a zero matrix at the recommended positions, turning the index
    # matrix into a 0/1 indicator matrix with the same shape as y_true.
    y_pred = torch.scatter(input=torch.zeros_like(y_true), dim=1, index=y_pred, src=torch.ones_like(y_true))
    mask = (y_true > 0).float()            # 1 where the user rated the item in the test set
    common = (y_pred * mask).sum(dim=1)    # |R(u) ∩ T(u)| per user
    y_true_sum = mask.sum(dim=1)           # |T(u)| per user
    return (common / y_true_sum).mean().item()
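The scatter call is the only non-obvious step, so here is a minimal sketch of what it does, using tiny made-up tensors (two users, five items, two recommendations each):
import torch

rec_index = torch.tensor([[0, 3],
                          [2, 4]])        # recommended item indices per user
y_true = torch.zeros(2, 5)                # same shape as the test rating matrix

indicator = torch.scatter(input=torch.zeros_like(y_true), dim=1,
                          index=rec_index, src=torch.ones_like(y_true))
print(indicator)
# tensor([[1., 0., 0., 1., 0.],
#         [0., 0., 1., 0., 1.]])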
1.2 Precision
def precision(y_pred, y_true):
    """Compute precision.
    Args:
        y_pred (tensor): each row holds the indices (0-based) of the items recommended to one user
        y_true (tensor): the rating matrix of the test set
    Returns:
        float: mean of the per-user precision values
    """
    y_pred = torch.scatter(input=torch.zeros_like(y_true), dim=1, index=y_pred, src=torch.ones_like(y_true))
    mask = (y_true > 0).float()
    common = (y_pred * mask).sum(dim=1)    # |R(u) ∩ T(u)| per user
    y_pred_sum = y_pred.sum(dim=1)         # |R(u)| per user (equal to k)
    return (common / y_pred_sum).mean().item()
1.3 Coverage
def Coverage(y_pred, items):
    """Compute coverage.
    Args:
        y_pred (tensor): each row holds the indices (0-based) of the items recommended to one user
        items (int): total number of items
    Returns:
        float: coverage
    """
    # Fraction of the catalogue that appears in at least one recommendation list.
    return len(y_pred.flatten().unique()) / items
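A quick usage sketch of the three functions on made-up tensors (two users, five items, k = 2 recommendations per user):
import torch

rec_index = torch.tensor([[0, 3],
                          [2, 4]])
test_matrix = torch.tensor([[5., 0., 0., 3., 1.],   # user 0 rated items 0, 3 and 4
                            [0., 0., 4., 0., 0.]])  # user 1 rated item 2

print(recall(rec_index, test_matrix))     # (2/3 + 1/1) / 2 ≈ 0.833
print(precision(rec_index, test_matrix))  # (2/2 + 1/2) / 2 = 0.75
print(Coverage(rec_index, 5))             # |{0, 2, 3, 4}| / 5 = 0.8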
2. Helper functions
2.1 LoadData
Read the dataset.
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
warnings.filterwarnings('ignore')
def loadData(filepath, name='ml-1m'):
    '''
    Description:
        Read the rating file of a MovieLens dataset.
    Args:
        filepath:
            string
            Directory that contains the dataset (note: it must end with '/')
        name:
            string
            Name of the dataset; only ml-1m and ml-100k are supported
    Returns:
        rating_data:
            pandas.DataFrame
            The ratings that were read
    '''
    print('Reading the data')
    filepath = filepath + name
    if name == 'ml-1m':
        filepath = filepath + '/' + 'ratings.dat'
        # '::' is a multi-character separator, so the python engine is used explicitly.
        rating_data = pd.read_csv(filepath, sep='::', engine='python', names=['userid', 'movieid', 'rating', 'timestamp'])
    elif name == 'ml-100k':
        filepath = filepath + '/' + 'u.data'
        rating_data = pd.read_csv(filepath, sep='\t', names=['userid', 'movieid', 'rating', 'timestamp'])
    else:
        raise ValueError('Unknown dataset; the only supported options are ml-1m and ml-100k')
    user_num = rating_data['userid'].max()
    movie_num = rating_data['movieid'].max()
    print(f'Number of users: {user_num}')
    print(f'Number of movies: {movie_num}')
    return rating_data
2.2 rating2matrix
Convert the rating DataFrame into a rating matrix (a numpy array; it is converted into a tensor later).
def rating2matrix(ratings, shape=None):
    '''
    Description:
        Convert the rating data into a rating matrix. Note that userid and movieid are
        1-based, so 1 is subtracted when they are used as indices.
    Args:
        ratings:
            pandas.DataFrame
            MovieLens ratings with columns=['userid', 'movieid', 'rating', 'timestamp']
    Returns:
        rating_matrix:
            numpy.ndarray
            The rating matrix
    '''
    print('Converting the data into a matrix')
    if shape is None:
        user_num = ratings['userid'].max()
        movie_num = ratings['movieid'].max()
        shape = (user_num, movie_num)
    print(f'Number of users: {shape[0]}')
    print(f'Number of movies: {shape[1]}')
    rating_matrix = np.zeros(shape=shape, dtype=np.float32)
    with tqdm(total=len(ratings)) as tq:
        for _, row in ratings.iterrows():
            row_id = int(row['userid']) - 1
            col_id = int(row['movieid']) - 1
            rating_matrix[row_id][col_id] = row['rating']
            tq.update()
    return rating_matrix
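Filling the matrix with iterrows is slow (about a minute for ml-1m, as the tqdm logs below show). Here is a minimal vectorized alternative using numpy fancy indexing; the name rating2matrix_fast is only illustrative and is not part of the original code:
def rating2matrix_fast(ratings, shape=None):
    # Same contract as rating2matrix, but fills the matrix with one vectorized assignment.
    if shape is None:
        shape = (ratings['userid'].max(), ratings['movieid'].max())
    rating_matrix = np.zeros(shape=shape, dtype=np.float32)
    rows = ratings['userid'].to_numpy(dtype=np.int64) - 1    # ids are 1-based
    cols = ratings['movieid'].to_numpy(dtype=np.int64) - 1
    rating_matrix[rows, cols] = ratings['rating'].to_numpy(dtype=np.float32)
    return rating_matrix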
2.3 Split_Dataset_P
Split the dataset by ratio: each user's ratings are split separately with the same test ratio and then concatenated, so every user is guaranteed to appear in the training set.
def Split_Dataset_P(rating_data, test_size=0.2, group_name='userid', random_state=42, shuffle=True):
    '''
    Description:
        Split each user's ratings separately with the same test ratio, so that every user
        appears in the training set and there is no cold-start problem.
    Args:
        rating_data: pd.DataFrame, the rating data
        test_size  : float, fraction of each user's ratings that goes into the test set
        group_name : string, name of the column used to group the data (the user id)
    Returns:
        train_ratings: pd.DataFrame, training set
        test_ratings : pd.DataFrame, test set
    '''
    print('Splitting the dataset')
    user_num = rating_data['userid'].max()
    train_parts, test_parts = [], []
    with tqdm(total=user_num) as tq:
        for _, group in rating_data.groupby(group_name):
            train_temp, test_temp = train_test_split(group, test_size=test_size, shuffle=shuffle, random_state=random_state)
            train_parts.append(train_temp)
            test_parts.append(test_temp)
            tq.update()
    # DataFrame.append is deprecated in recent pandas, so collect the pieces and concat once.
    train_ratings = pd.concat(train_parts)
    test_ratings = pd.concat(test_parts)
    return train_ratings, test_ratings
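Looping over all 6,040 users takes a few minutes (see the tqdm log below). If exact compatibility with train_test_split is not required, the per-user split can also be sketched with DataFrameGroupBy.sample (pandas 1.1 or later); this is an alternative sketch, not the function used in the rest of the notebook:
def split_dataset_fast(rating_data, test_size=0.2, group_name='userid', random_state=42):
    # Sample a fraction of every user's ratings as the test set; the remainder is the training set.
    test_ratings = rating_data.groupby(group_name).sample(frac=test_size, random_state=random_state)
    train_ratings = rating_data.drop(test_ratings.index)
    return train_ratings, test_ratings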
3. Making recommendations
This function normalizes the matrix t so that every row sums to 1.
def normalize_e(t):
    # Divide each row by its sum (a row that sums to zero would produce NaNs here).
    return t / t.sum(dim=1).unsqueeze(dim=1)
Predict a user's ratings as a similarity-weighted sum of the ratings of that user's most similar users.
def predict_score(scores, index, similarity):
    # scores:     (num_users, num_items) training rating matrix
    # index:      for each user, the row indices of the most similar users
    # similarity: the corresponding similarity values
    r1 = []
    similarity = normalize_e(similarity)
    for i, j in zip(index, similarity):
        selected_score = torch.index_select(input=scores, index=i, dim=0)  # ratings of the similar users
        final_score = torch.mm(j.unsqueeze(dim=0), selected_score)         # weighted sum
        r1.append(final_score)
    r1 = torch.cat(r1, dim=0)
    return r1
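A minimal shape check with random tensors (the sizes below are made up for illustration):
import torch

num_users, num_items, k_sim = 4, 6, 2
scores = torch.rand(num_users, num_items)                      # fake training ratings
sim_value = torch.rand(num_users, k_sim)                       # similarity to the k_sim nearest users
sim_index = torch.randint(0, num_users, (num_users, k_sim))    # their row indices

pred = predict_score(scores, index=sim_index, similarity=sim_value)
print(pred.shape)                                # torch.Size([4, 6])
print(torch.topk(pred, k=3, dim=1).indices)      # top-3 item indices per user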
Sort the predicted scores from high to low and take the top k.
def recommend(score, k=3):
    pred, index = torch.topk(input=score, k=k, dim=1)
    return pred, index
Read the dataset.
ratings = loadData('./data/', 'ml-1m')
Reading the data
Number of users: 6040
Number of movies: 3952
Split the dataset.
train_data, test_data = Split_Dataset_P(ratings)
Splitting the dataset
100%|██████████| 6040/6040 [04:55<00:00, 20.44it/s]
Convert the datasets into matrices.
train_data = rating2matrix(train_data)
test_data = rating2matrix(test_data)
Converting the data into a matrix
Number of users: 6040
Number of movies: 3952
100%|██████████| 797758/797758 [01:06<00:00, 12034.77it/s]
Converting the data into a matrix
Number of users: 6040
Number of movies: 3952
100%|██████████| 202451/202451 [00:17<00:00, 11261.42it/s]
Compute the Jaccard similarity between users.
train_data = torch.tensor(train_data)
test_data = torch.tensor(test_data)   # the metric functions above also expect tensors
simi = Similarity.SimJaccardS(train_data)
Use the similarities computed above to predict the ratings. Here sim_user_index and sim_user_value are assumed to be the indices and values of each user's most similar neighbours produced by the similarity step (the Similarity module itself is not shown in this notebook).
pred_score = predict_score(train_data, index=sim_user_index, similarity=sim_user_value)
pred_score.shape
torch.Size([6040, 3952])
Filter the predicted scores by removing movies the user has already watched.
mask = train_data > 0
mask = ~mask
filtered_score = mask * pred_score
Use the predicted scores to make recommendations.
rec_value, rec_index = recommend(filtered_score, k=10)
Evaluate the results.
recall_rate = Metirc.recall(rec_index, test_data)
recall_rate * 100
14.493964612483978
precision = Metirc.precision(rec_index, test_data)
print(precision)
0.3004801273345947
coverage = Metirc.Coverage(rec_index, 1682)
coverage
0.3055885850178359
(Note: 1682 is the item count of ml-100k; for ml-1m the denominator should be 3952, which would give a lower coverage.)
4. ItemCF
For ItemCF it is enough to transpose the rating matrix and reuse the same pipeline; a sketch follows below.
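A minimal self-contained sketch of the idea. For illustration it uses cosine similarity for the item-item step (the Jaccard routine above lives in an external Similarity module that is not shown here), and variable names such as item_pred are invented for this sketch:
item_matrix = train_data.t()                                         # rows are items, columns are users
norms = item_matrix.norm(dim=1, keepdim=True).clamp(min=1e-8)        # avoid dividing by zero for unrated items
item_sim = (item_matrix / norms) @ (item_matrix / norms).t()         # (num_items, num_items) cosine similarity
item_sim.fill_diagonal_(0)                                           # an item should not recommend itself

sim_item_value, sim_item_index = torch.topk(item_sim, k=20, dim=1)   # keep the 20 most similar items per item
item_pred = predict_score(item_matrix, index=sim_item_index, similarity=sim_item_value).t()  # back to (users, items)
item_pred = torch.nan_to_num(item_pred, nan=0.0)                     # items nobody rated yield NaNs in normalize_e

item_filtered = (~(train_data > 0)) * item_pred                      # drop movies the user has already watched
item_rec_value, item_rec_index = recommend(item_filtered, k=10)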
5. Going further
I also implemented a version that does not use matrix multiplication, but it runs very slowly and the code is rather convoluted, so it is not recommended.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
import os
from tqdm import tqdm
warnings.filterwarnings('ignore')
rnames = ['user_id','movie_id','rating','timestamp']
root_path = './data'
ratings = pd.read_csv(os.path.join(root_path, 'ratings.dat'), sep='::', engine='python', names=rnames)
user_nums = ratings['user_id'].max()
print(user_nums)
movie_nums = ratings['movie_id'].max()
print(movie_nums)
# Rating lookup table indexed directly by (user_id, movie_id); row/column 0 are unused.
query_ratings = np.zeros(shape=(user_nums + 1, movie_nums + 1))
for _, row in tqdm(ratings.iterrows(), total=len(ratings)):
    query_ratings[row['user_id']][row['movie_id']] = row['rating']
np.save('ratings.npy', query_ratings)   # save only after the table has been filled
ratings.head()
trn_data, val_data = train_test_split(ratings, test_size=0.2, random_state=42)
# Collect each user's movies into a list, then build user -> set-of-movies dictionaries.
trn_data = trn_data.groupby('user_id')['movie_id'].apply(list).reset_index()
val_data = val_data.groupby('user_id')['movie_id'].apply(list).reset_index()
trn_user_items = {}
val_user_items = {}
for user, movies in zip(trn_data['user_id'], trn_data['movie_id']):
    trn_user_items[user] = set(movies)
for user, movies in zip(val_data['user_id'], val_data['movie_id']):
    val_user_items[user] = set(movies)
# Inverted index: item -> set of users who rated it in the training set.
item_users = {}
for uid, items in tqdm(trn_user_items.items()):
    for item in items:
        if item not in item_users:
            item_users[item] = set()
        item_users[item].add(uid)
# Co-occurrence statistics: num[u] counts how many items user u rated,
# sim[u][v] counts how many items users u and v rated in common.
sim = {}
num = {}
for item, users in tqdm(item_users.items()):
    for u in users:
        if u not in num:
            num[u] = 0
        num[u] += 1
        if u not in sim:
            sim[u] = {}
        for v in users:
            if u != v:
                if v not in sim[u]:
                    sim[u][v] = 0
                sim[u][v] += 1
import pickle
# Cache the expensive co-occurrence results so they can be reloaded later;
# each dictionary is dumped first and then read back.
with open('sim_dict.pkl', 'wb') as fw:
    pickle.dump(sim, fw)
with open('sim_dict.pkl', 'rb') as fr:
    sim = pickle.load(fr)
print(type(sim))
with open('num.pkl', 'wb') as fw:
    pickle.dump(num, fw)
with open('num.pkl', 'rb') as fr:
    num = pickle.load(fr)
print(type(num))
trn_data.to_csv('traindata.csv')
trn_data = pd.read_csv('traindata.csv')
trn_data.head()
val_data.to_csv('valdata.csv')
val_data = pd.read_csv('valdata.csv')
val_data.head()
import math
# Normalize the co-occurrence counts into a cosine-style user similarity:
# sim(u, v) = |N(u) ∩ N(v)| / sqrt(|N(u)| * |N(v)|)
for u, users in tqdm(sim.items()):
    for v, score in users.items():
        sim[u][v] = score / math.sqrt(num[u] * num[v])
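For intuition, a tiny made-up example of this normalization:
import math

# Suppose users 1 and 2 rated 4 and 9 items respectively and share 3 of them.
num_toy = {1: 4, 2: 9}
co_rated = 3
print(co_rated / math.sqrt(num_toy[1] * num_toy[2]))   # 3 / 6 = 0.5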
K = 80   # number of most similar users to consider
N = 10   # number of items to recommend
item_rank = {}    # accumulated similarity-weighted ratings for each candidate item
item_scores = {}  # accumulated similarity weights for each candidate item
for u, _ in tqdm(val_user_items.items()):
    item_rank[u] = {}
    item_scores[u] = {}
    for v, score in sorted(sim[u].items(), key=lambda x: x[1], reverse=True)[:K]:
        for item in trn_user_items[v]:
            if item not in trn_user_items[u]:
                if item not in item_rank[u]:
                    item_rank[u][item] = 0
                    item_scores[u][item] = 0
                user_rating = query_ratings[v][item]
                item_rank[u][item] += score * user_rating
                item_scores[u][item] += score
# Sum of each user's top-K neighbour similarities, used as a normalizer below.
normal_sim = {}
for u, _ in tqdm(val_user_items.items()):
    temp = 0
    for _, s in sorted(sim[u].items(), key=lambda x: x[1], reverse=True)[:K]:
        temp += s
    normal_sim[u] = temp
# Variant: weight each neighbour's rating by its similarity divided by the sum of the top-K similarities.
K = 80
N = 10
item_rank = {}
for u, _ in tqdm(val_user_items.items()):
    item_rank[u] = {}
    for v, score in sorted(sim[u].items(), key=lambda x: x[1], reverse=True)[:K]:
        for item in trn_user_items[v]:
            if item not in trn_user_items[u]:
                if item not in item_rank[u]:
                    item_rank[u][item] = 0
                user_rating = query_ratings[v][item]
                item_rank[u][item] += (score / normal_sim[u]) * user_rating
# Spot check: user 1's ten most similar users and the candidate items computed for user 1.
sorted(sim[1].items(), key=lambda x: x[1], reverse=True)[:10]
item_rank[1]
# Turn the accumulated weighted ratings into weighted averages by dividing by the accumulated weights.
for u, _ in tqdm(item_rank.items()):
    for item, temp in item_rank[u].items():
        item_rank[u][item] = temp / item_scores[u][item]
item_rank[1]
item_scores[1][3712]
# Final variant, used for the evaluation below: score each candidate item by the plain sum of the
# neighbours' similarities (the classic UserCF ranking score).
K = 80
N = 10
sl = []
item_rank = {}
for u, _ in tqdm(val_user_items.items()):
    item_rank[u] = {}
    for v, score in sorted(sim[u].items(), key=lambda x: x[1], reverse=True)[:K]:
        for item in trn_user_items[v]:
            if item not in trn_user_items[u]:
                if item not in item_rank[u]:
                    item_rank[u][item] = 0
                item_rank[u][item] += score
                sl.append(score)
# Quick sanity check: print the first user's most similar neighbour and its similarity, then stop.
for u, _ in tqdm(val_user_items.items()):
    for v, score in sorted(sim[u].items(), key=lambda x: x[1], reverse=True)[:K]:
        print(u, v)
        print(score)
        break
    break
# Keep each user's top-N candidates and reduce them to a set of item ids for evaluation.
item_rank2 = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)[:N] for k, v in item_rank.items()}
item_rank2 = {k: set([x[0] for x in v]) for k, v in item_rank2.items()}
item_rank2
def Recall(Rec_dict, Val_dict):
    '''
    Rec_dict: recommendation lists produced by the algorithm, e.g. {uid: {item1, item2, ...}, ...}
    Val_dict: items each user actually interacted with, e.g. {uid: {item1, item2, ...}, ...}
    '''
    hit_items = 0
    all_items = 0
    for uid, items in Val_dict.items():
        rel_set = items
        rec_set = Rec_dict[uid]
        for item in rec_set:
            if item in rel_set:
                hit_items += 1
        all_items += len(rel_set)
    return round(hit_items / all_items * 100, 2)
def Precision(Rec_dict, Val_dict):
    '''
    Rec_dict: recommendation lists produced by the algorithm, e.g. {uid: {item1, item2, ...}, ...}
    Val_dict: items each user actually interacted with, e.g. {uid: {item1, item2, ...}, ...}
    '''
    hit_items = 0
    all_items = 0
    for uid, items in Val_dict.items():
        rel_set = items
        rec_set = Rec_dict[uid]
        for item in rec_set:
            if item in rel_set:
                hit_items += 1
        all_items += len(rec_set)
    return round(hit_items / all_items * 100, 2)
def Coverage(Rec_dict, Trn_dict):
    '''
    Rec_dict: recommendation lists produced by the algorithm, e.g. {uid: {item1, item2, ...}, ...}
    Trn_dict: items each user interacted with in the training set, e.g. {uid: {item1, item2, ...}, ...}
    '''
    rec_items = set()
    all_items = set()
    for uid in Rec_dict:
        for item in Trn_dict[uid]:
            all_items.add(item)
        for item in Rec_dict[uid]:
            rec_items.add(item)
    return round(len(rec_items) / len(all_items) * 100, 2)
def Popularity(Rec_dict, Trn_dict):
    '''
    Rec_dict: recommendation lists produced by the algorithm, e.g. {uid: {item1, item2, ...}, ...}
    Trn_dict: items each user interacted with in the training set, e.g. {uid: {item1, item2, ...}, ...}
    '''
    pop_items = {}
    for uid in Trn_dict:
        for item in Trn_dict[uid]:
            if item not in pop_items:
                pop_items[item] = 0
            pop_items[item] += 1
    pop, num = 0, 0
    for uid in Rec_dict:
        for item in Rec_dict[uid]:
            pop += math.log(pop_items[item] + 1)
            num += 1
    return round(pop / num, 3)
def rec_eval(val_rec_items, val_user_items, trn_user_items):
    print('recall:', Recall(val_rec_items, val_user_items))
    print('precision:', Precision(val_rec_items, val_user_items))
    print('coverage:', Coverage(val_rec_items, trn_user_items))
    print('Popularity:', Popularity(val_rec_items, trn_user_items))
rec_eval(item_rank2, val_user_items, trn_user_items)
item_rank2[1]
val_user_items[1]
sorted(item_rank[1].items(), key=lambda x: x[1], reverse=True)