推荐系统–基于用户的协同过滤算法(UserCF)
基本概念
基本思想:向用户 $u$ 推荐时,我们可以先找到和 $u$ 相似的用户集合 $N_u$,然后把这些用户喜欢的、但 $u$ 没有看过的物品推荐给他。
步骤:
- 找到和目标用户相似的用户集合。
- 找到这个集合中的用户喜欢的,且目标用户还没有听说过的物品,然后推荐给目标用户。
步骤1 - - 找相似用户
为了找到目标用户的相似用户,我们需要计算不同用户之间的相似度。协同过滤算法主要利用行为的相似度计算兴趣的相似度。对于用户 $u$ 和 $v$,$N_u$、$N_v$ 分别表示他们有交互的物品集合。我们通过 Jaccard 公式计算用户之间的相似度:
$$sim_{(u,v)}=\frac{\vert N_u \cap N_v \vert}{\vert N_u \cup N_v \vert}$$
同样,我们也可以利用余弦相似度、皮尔逊系数来计算用户之间的相似度。具体可见 推荐系统–协同过滤(Collaborative Filtering)
具体案例
我们以电影评分预测为例,数据集为 MovieLens-1M,有关数据集详情,可见 推荐系统–MovieLens数据集。
读取数据集
def get_data(data_path, test_size=0.2, random_state=None):
    """Load MovieLens ratings and split them into per-user item sets.

    Args:
        data_path: Directory that contains ``ratings.dat`` (``::``-separated
            rows of user_id, movie_id, rating, timestamp).
        test_size: Fraction of rating rows held out for validation.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split, and therefore the reported metrics, can be reproduced.

    Returns:
        ``(train_user_item, val_user_item)``. Each maps a user_id to the set
        of movie_ids the user interacted with in that split, e.g.
        ``{1: {1193, 661, 914, ...}, 2: {...}, ...}``.
    """
    col_names = ["user_id", "movie_id", "rating", "timestamp"]
    ratings = pd.read_csv(
        os.path.join(data_path, "ratings.dat"),
        sep="::",
        engine="python",  # multi-character separators need the python parser
        names=col_names,
    )
    # ratings looks like:
    #    user_id  movie_id  rating  timestamp
    # 0        1      1193       5  978300760
    # 1        1       661       3  978302109
    # 2        1       914       3  978301968

    # Split the frame once; passing it twice only produced discarded copies.
    train_data, val_data = train_test_split(
        ratings, test_size=test_size, random_state=random_state
    )

    def to_user_items(frame):
        # Collapse rating rows into {user_id: {movie_id, ...}}.
        # After the groupby each row is (user_id, [movie_id, movie_id, ...]).
        grouped = frame.groupby("user_id")["movie_id"].apply(list).reset_index()
        return {
            user_id: set(movies)
            for user_id, movies in zip(grouped["user_id"], grouped["movie_id"])
        }

    train_user_item = to_user_items(train_data)
    val_user_item = to_user_items(val_data)
    return train_user_item, val_user_item
建立倒排表 items -> users
我们根据训练集合中用户和物品的交互信息建立倒排表 items_users。
倒排表的格式为:{item_id1:{user_id1, user_id2,…}, item_id2:{user_id1,…},…} ,它表示每个物品都与哪些用户有交互,建立倒排表的目的是为了更好的统计用户之间的共同交互的物品数量。
def item_user_list(train_user_item):
    """Invert the user -> items mapping into an item -> users table.

    Args:
        train_user_item: ``{user_id: {item_id, ...}}`` built from the
            training split.

    Returns:
        ``{item_id: {user_id, ...}}`` listing, for every item, the users who
        interacted with it.
    """
    print("建立倒排表....")
    inverted = {}
    for uid, watched in tqdm(train_user_item.items()):
        for movie in watched:
            # Create the user set on first sight of the item, then record uid.
            inverted.setdefault(movie, set()).add(uid)
    return inverted
建立协同过滤矩阵
根据倒排表 items_users 来统计用户之间共同交互的物品数量。
协同过滤矩阵的形式为: {user_id1:{user_id2:num1, user_id3:num2}, user_id2:{user_id1:num1, user_id3:num2},…},它是一个双层字典,表示不同用户之间共同交互的物品数量。
在计算协同过滤矩阵的同时,还要记录每个用户所交互的物品数量,形式为:num = {user_id1:num1, user_id2:num2,…}
def CollaborativeFilterMatrix(train_user_item, items_users):
    """Count, for every pair of users, how many items they share.

    Args:
        train_user_item: Not read by this function; kept so the call
            signature stays unchanged.
        items_users: ``{item_id: {user_id, ...}}`` inverted table.

    Returns:
        ``(CFMatrix, num)`` where ``CFMatrix[u][v]`` is the number of items
        both ``u`` and ``v`` interacted with (for example
        ``CFMatrix[1] = {6: 8, 8: 7, ...}``), and ``num[u]`` is the number of
        items ``u`` interacted with (for example ``num[1] = 43``).
    """
    co_counts = {}
    item_counts = {}
    print("构建协同过滤矩阵....")
    for _, audience in tqdm(items_users.items()):
        for u in audience:
            item_counts[u] = item_counts.get(u, 0) + 1
            row = co_counts.setdefault(u, {})
            for v in audience:
                if v == u:
                    continue  # a user is never paired with themselves
                row[v] = row.get(v, 0) + 1
    return co_counts, item_counts
计算相似度矩阵
我们使用余弦相似度来计算用户之间的相似度。用户的协同过滤矩阵可以看作余弦相似度的分子部分,还需要处理分母:分母是两个用户各自交互物品数量乘积的平方根,即 $\sqrt{\vert N_u \vert \cdot \vert N_v \vert}$,每个用户所交互物品的个数保存在 num 字典中。
def ComputeSimilarity(CFMatrix, num):
    """Turn co-occurrence counts into cosine similarities, in place.

    Every ``CFMatrix[u][v]`` is overwritten with
    ``count / sqrt(num[u] * num[v])``. No copy is made: the returned object
    is the very dict that was passed in, so the raw counts are gone after
    this call.

    Args:
        CFMatrix: ``{u: {v: shared_item_count}}``.
        num: ``{user_id: number_of_items_the_user_interacted_with}``.

    Returns:
        The same nested dict, now holding similarities, e.g.
        ``sim[1] = {6: 0.1433..., 8: 0.0710..., ...}``.
    """
    print("构建用户相似度矩阵....")
    for u, neighbours in tqdm(CFMatrix.items()):
        for v, shared in neighbours.items():
            # Only values of existing keys change, which is safe while iterating.
            neighbours[v] = shared / np.sqrt(num[u] * num[v])
    return CFMatrix
步骤2 - - 进行 Top-N 推荐
根据与用户 $u$ 相似的前 $K$ 个用户的喜好为 $u$ 进行 Top-N 推荐
首先,需要根据用户相似度矩阵得到与当前用户最相似的前 K 个用户,然后对这 K 个用户所交互物品集中但当前用户并未交互过的物品计算相似度分数,最终推荐的候选物品的相似度分数是由多个用户对该物品分数的一个累加和。
def RecForUser(sim, train_user_item, val_user_item, K, N):
    """Recommend the top-N unseen items to every validation user.

    For each user the K most similar users are selected; every item those
    neighbours interacted with, and the user did not, is scored with the sum
    of the similarities of the neighbours who interacted with it.

    Args:
        sim: ``{u: {v: similarity}}``.
        train_user_item: ``{user_id: {item_id, ...}}`` from the training split.
        val_user_item: ``{user_id: {item_id, ...}}``; only its keys are used,
            as the list of users to recommend for.
        K: Number of nearest neighbours to consult.
        N: Number of items to recommend per user.

    Returns:
        ``{user_id: {item_id, ...}}`` with at most N items per user. A user
        with no training history or no neighbours gets an empty set.
    """
    print("给测试用户进行推荐....")
    items_rank = {}
    for u in tqdm(val_user_item):
        # A validation user may be missing from the training split (all of
        # their ratings were held out); treat that as "no history" rather
        # than failing with KeyError.
        seen = train_user_item.get(u, set())
        neighbours = sorted(
            sim.get(u, {}).items(), key=lambda x: x[1], reverse=True
        )[:K]
        scores = {}
        for v, score in neighbours:
            for item in train_user_item[v]:
                if item in seen:
                    continue
                scores[item] = scores.get(item, 0) + score
        # scores has the shape {item_id: accumulated_similarity}
        items_rank[u] = scores

    print("为每个用户进行Top-N推荐....")
    return {
        u: {
            item
            for item, _ in sorted(
                scores.items(), key=lambda x: x[1], reverse=True
            )[:N]
        }
        for u, scores in items_rank.items()
    }
评测指标
我们一般使用 Precision@N、Recall@N等指标来衡量推荐系统的性能。有关评测指标的内容可以参考 推荐系统–评估方法和评估指标。
def Precision(rec_dict, val_dict):
    """Return Precision@N in percent, rounded to two decimals.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        val_dict: Held-out interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        ``hits / number_of_recommended_items * 100`` over the users in
        ``val_dict``, or ``0.0`` when nothing was recommended.
    """
    hit_items = 0
    all_items = 0
    for user_id, real_set in val_dict.items():
        # A user without recommendations contributes nothing instead of
        # raising KeyError.
        rec_set = rec_dict.get(user_id, ())
        hit_items += sum(1 for item in rec_set if item in real_set)
        all_items += len(rec_set)
    if all_items == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(hit_items / all_items * 100, 2)
def Recall(rec_dict, val_dict):
    """Return Recall@N in percent, rounded to two decimals.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        val_dict: Held-out interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        ``hits / number_of_held_out_items * 100`` over the users in
        ``val_dict``, or ``0.0`` when there are no held-out items.
    """
    hit_items = 0
    all_items = 0
    for user_id, real_set in val_dict.items():
        # A user without recommendations contributes no hits instead of
        # raising KeyError.
        rec_set = rec_dict.get(user_id, ())
        hit_items += sum(1 for item in rec_set if item in real_set)
        all_items += len(real_set)
    if all_items == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(hit_items / all_items * 100, 2)
案例–基于用户的协同过滤(电影评分预测)
import math
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
def Recall(rec_dict, val_dict):
    """Return Recall@N in percent, rounded to two decimals.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        val_dict: Held-out interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        ``hits / number_of_held_out_items * 100`` over the users in
        ``val_dict``, or ``0.0`` when there are no held-out items.
    """
    hit_items = 0
    all_items = 0
    for user_id, real_set in val_dict.items():
        # A user without recommendations contributes no hits instead of
        # raising KeyError.
        rec_set = rec_dict.get(user_id, ())
        hit_items += sum(1 for item in rec_set if item in real_set)
        all_items += len(real_set)
    if all_items == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(hit_items / all_items * 100, 2)
def Precision(rec_dict, val_dict):
    """Return Precision@N in percent, rounded to two decimals.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        val_dict: Held-out interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        ``hits / number_of_recommended_items * 100`` over the users in
        ``val_dict``, or ``0.0`` when nothing was recommended.
    """
    hit_items = 0
    all_items = 0
    for user_id, real_set in val_dict.items():
        # A user without recommendations contributes nothing instead of
        # raising KeyError.
        rec_set = rec_dict.get(user_id, ())
        hit_items += sum(1 for item in rec_set if item in real_set)
        all_items += len(rec_set)
    if all_items == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(hit_items / all_items * 100, 2)
def Coverage(rec_dict, train_dict):
    """Return catalogue coverage of the recommendations in percent.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        train_dict: Training interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        ``distinct_recommended_items / distinct_training_items * 100``
        rounded to two decimals, where the training items are those of the
        users present in ``rec_dict``. ``0.0`` when that catalogue is empty.
    """
    recommended = set()
    catalogue = set()
    for user_id, rec_items in rec_dict.items():
        # Users without training history add nothing to the catalogue
        # instead of raising KeyError.
        catalogue.update(train_dict.get(user_id, ()))
        recommended.update(rec_items)
    if not catalogue:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(len(recommended) / len(catalogue) * 100, 2)
def Popularity(rec_dict, train_dict):
    """Return the mean log-popularity of the recommended items.

    An item's popularity is the number of training users who interacted with
    it; each recommended item contributes ``log(popularity + 1)``.

    Args:
        rec_dict: Recommendations, ``{user_id: {item1, item2, ...}}``.
        train_dict: Training interactions, ``{user_id: {item1, item2, ...}}``.

    Returns:
        The average contribution over all recommended items, rounded to
        three decimals, or ``0.0`` when nothing was recommended.
    """
    item_counts = {}
    for items in train_dict.values():
        for item in items:
            item_counts[item] = item_counts.get(item, 0) + 1

    pop, num = 0, 0
    for rec_items in rec_dict.values():
        for item in rec_items:
            # An item never seen in training has popularity 0 -> log(1) == 0,
            # rather than raising KeyError.
            pop += math.log(item_counts.get(item, 0) + 1)
            num += 1
    if num == 0:
        return 0.0  # avoid ZeroDivisionError on empty input
    return round(pop / num, 3)
def rec_eval(val_rec_items, val_user_items, trn_user_items):
    """Print recall, precision, coverage and popularity for a set of recommendations.

    Args:
        val_rec_items: ``{user_id: {item_id, ...}}`` recommendations.
        val_user_items: ``{user_id: {item_id, ...}}`` held-out interactions.
        trn_user_items: ``{user_id: {item_id, ...}}`` training interactions.
    """
    recall = Recall(val_rec_items, val_user_items)
    print('Recall:', recall)
    precision = Precision(val_rec_items, val_user_items)
    print('Precision', precision)
    coverage = Coverage(val_rec_items, trn_user_items)
    print('Coverage', coverage)
    popularity = Popularity(val_rec_items, trn_user_items)
    print('Popularity', popularity)
def get_data(data_path, test_size=0.2, random_state=None):
    """Load MovieLens ratings and split them into per-user item sets.

    Args:
        data_path: Directory that contains ``ratings.dat`` (``::``-separated
            rows of user_id, movie_id, rating, timestamp).
        test_size: Fraction of rating rows held out for validation.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split, and therefore the reported metrics, can be reproduced.

    Returns:
        ``(train_user_item, val_user_item)``. Each maps a user_id to the set
        of movie_ids the user interacted with in that split.
    """
    col_names = ["user_id", "movie_id", "rating", "timestamp"]
    ratings = pd.read_csv(
        os.path.join(data_path, "ratings.dat"),
        sep="::",
        engine="python",  # multi-character separators need the python parser
        names=col_names,
    )
    # Split the frame once; passing it twice only produced discarded copies.
    train_data, val_data = train_test_split(
        ratings, test_size=test_size, random_state=random_state
    )

    def to_user_items(frame):
        # Collapse rating rows into {user_id: {movie_id, ...}}.
        grouped = frame.groupby("user_id")["movie_id"].apply(list).reset_index()
        return {
            user_id: set(movies)
            for user_id, movies in zip(grouped["user_id"], grouped["movie_id"])
        }

    train_user_item = to_user_items(train_data)
    val_user_item = to_user_items(val_data)
    return train_user_item, val_user_item
def item_user_list(train_user_item):
    """Invert the user -> items mapping into an item -> users table.

    Args:
        train_user_item: ``{user_id: {item_id, ...}}`` built from the
            training split.

    Returns:
        ``{item_id: {user_id, ...}}`` listing, for every item, the users who
        interacted with it.
    """
    print("建立倒排表....")
    inverted = {}
    for uid, watched in tqdm(train_user_item.items()):
        for movie in watched:
            # Create the user set on first sight of the item, then record uid.
            inverted.setdefault(movie, set()).add(uid)
    return inverted
def CollaborativeFilterMatrix(train_user_item, items_users):
    """Count, for every pair of users, how many items they share.

    Args:
        train_user_item: Not read by this function; kept so the call
            signature stays unchanged.
        items_users: ``{item_id: {user_id, ...}}`` inverted table.

    Returns:
        ``(CFMatrix, num)`` where ``CFMatrix[u][v]`` is the number of items
        both ``u`` and ``v`` interacted with, and ``num[u]`` is the number of
        items ``u`` interacted with.
    """
    co_counts = {}
    item_counts = {}
    print("构建协同过滤矩阵....")
    for _, audience in tqdm(items_users.items()):
        for u in audience:
            item_counts[u] = item_counts.get(u, 0) + 1
            row = co_counts.setdefault(u, {})
            for v in audience:
                if v == u:
                    continue  # a user is never paired with themselves
                row[v] = row.get(v, 0) + 1
    return co_counts, item_counts
def ComputeSimilarity(CFMatrix, num):
    """Turn co-occurrence counts into cosine similarities, in place.

    Every ``CFMatrix[u][v]`` is overwritten with
    ``count / sqrt(num[u] * num[v])``. No copy is made: the returned object
    is the very dict that was passed in, so the raw counts are gone after
    this call.

    Args:
        CFMatrix: ``{u: {v: shared_item_count}}``.
        num: ``{user_id: number_of_items_the_user_interacted_with}``.

    Returns:
        The same nested dict, now holding similarities.
    """
    print("构建用户相似度矩阵....")
    for u, neighbours in tqdm(CFMatrix.items()):
        for v, shared in neighbours.items():
            # Only values of existing keys change, which is safe while iterating.
            neighbours[v] = shared / np.sqrt(num[u] * num[v])
    return CFMatrix
def RecForUser(sim, train_user_item, val_user_item, K, N):
    """Recommend the top-N unseen items to every validation user.

    For each user the K most similar users are selected; every item those
    neighbours interacted with, and the user did not, is scored with the sum
    of the similarities of the neighbours who interacted with it.

    Args:
        sim: ``{u: {v: similarity}}``.
        train_user_item: ``{user_id: {item_id, ...}}`` from the training split.
        val_user_item: ``{user_id: {item_id, ...}}``; only its keys are used,
            as the list of users to recommend for.
        K: Number of nearest neighbours to consult.
        N: Number of items to recommend per user.

    Returns:
        ``{user_id: {item_id, ...}}`` with at most N items per user. A user
        with no training history or no neighbours gets an empty set.
    """
    print("给测试用户进行推荐....")
    items_rank = {}
    for u in tqdm(val_user_item):
        # A validation user may be missing from the training split (all of
        # their ratings were held out); treat that as "no history" rather
        # than failing with KeyError.
        seen = train_user_item.get(u, set())
        neighbours = sorted(
            sim.get(u, {}).items(), key=lambda x: x[1], reverse=True
        )[:K]
        scores = {}
        for v, score in neighbours:
            for item in train_user_item[v]:
                if item in seen:
                    continue
                scores[item] = scores.get(item, 0) + score
        items_rank[u] = scores

    print("为每个用户进行Top-N推荐....")
    return {
        u: {
            item
            for item, _ in sorted(
                scores.items(), key=lambda x: x[1], reverse=True
            )[:N]
        }
        for u, scores in items_rank.items()
    }
if __name__ == "__main__":
    # End-to-end run: load data, build the similarity matrix, recommend, evaluate.
    root_path = './data/ml-1m/'
    train_user_item, val_user_item = get_data(root_path)

    items_users = item_user_list(train_user_item)
    CFMatrix, num = CollaborativeFilterMatrix(train_user_item, items_users)
    sim = ComputeSimilarity(CFMatrix, num)

    rec_items = RecForUser(
        sim,
        train_user_item,
        val_user_item,
        K=80,
        N=10,
    )
    rec_eval(rec_items, val_user_item, train_user_item)

    # Metrics reported by the author for one run (the split is random, so
    # the numbers vary from run to run):
    #   Recall: 10.26
    #   Precision 33.99
    #   Coverage 19.41
    #   Popularity 7.228
代码参考了 https://github.com/datawhalechina/fun-rec?spm=5176.21852664.0.0.7c1147a9gwgeMq
|