用 Python 实现了基础的 Item-based Collaborative Filtering(基于物品的协同过滤)方法,更多内容可以访问我的项目:添加链接描述
import numpy as np
import pandas as pd
import json
from pandas import Series, DataFrame
import math
# Load the training ratings and index them as {userId: {movieId: rating}}.
# A u'' literal replaces the Python-2-only unicode() call (NameError on
# Python 3); the path text itself is unchanged.
tdata = pd.read_csv(u'D:/推荐系统数据集/训练集与测试集/traindata.csv')
tdata.drop(columns='timestamp', inplace=True)
tdata['userId'] = tdata['userId'].astype('str')
tdata['movieId'] = tdata['movieId'].astype('str')
tdata['rating'] = tdata['rating'].astype('float')
# Group by a scalar key so get_group() accepts a plain user id on every
# pandas version (a list-of-one key makes newer releases expect a tuple).
df1 = tdata.groupby(by='userId')
types = tdata['userId'].unique()
info1 = {}
for user_id in types:
    df2 = df1.get_group(user_id)
    # Columns are addressed by position, as in the original code: column 1 is
    # used as the item key and column 2 as the value.
    # NOTE(review): assumes the CSV column order is userId, movieId, rating,
    # timestamp - confirm against the data file.
    info1[user_id] = dict(zip(df2.iloc[:, 1].values, df2.iloc[:, 2].values))
info_json = json.dumps(info1, indent=4)
train1 = info1
# Build two tables from train1:
#   N1[i]    - number of users whose item map contains item i
#   C1[i][j] - co-occurrence weight of items i and j, where every user adds
#              1 / log(1 + number_of_items_of_that_user) ** 2 per item pair
C1 = dict()
N1 = dict()
for items in train1.values():
    if not items:
        # No items means no pairs; skipping also avoids 1 / log(1) ** 2.
        continue
    # The weight depends only on the user, so compute it once per user.
    pair_weight = 1 / ((math.log(1 + len(items) * 1.0)) ** 2)
    for i in items:
        N1[i] = N1.get(i, 0) + 1
        for j in items:
            if i == j:
                continue
            row = C1.setdefault(i, {})
            row[j] = row.get(j, 0) + pair_weight
# Turn the co-occurrence weights into W1 by dividing each C1[i][j] by
# sqrt(N1[i] * N1[j]).
W1 = dict()
for i, related_items in C1.items():
    if not related_items:
        # An empty row never produced a W1 entry in the original loop either.
        continue
    n_i = N1[i]
    W1[i] = {
        j: cij / math.sqrt(n_i * N1[j])
        for j, cij in related_items.items()
    }
# Reload the training file, this time keeping timestamps instead of ratings,
# and index it as {userId: {movieId: timestamp}} (train2).
# A u'' literal replaces the Python-2-only unicode() call (NameError on
# Python 3); the path text itself is unchanged.
Tdata = pd.read_csv(u'D:/推荐系统数据集/训练集与测试集/traindata.csv')
Tdata.drop(columns='rating', inplace=True)
Tdata['userId'] = Tdata['userId'].astype('str')
Tdata['movieId'] = Tdata['movieId'].astype('str')
Tdata['timestamp'] = Tdata['timestamp'].astype('float')
print(Tdata)
min1 = min(Tdata['timestamp'])
print(min1)
# Group by a scalar key so get_group() accepts a plain user id on every
# pandas version (a list-of-one key makes newer releases expect a tuple).
db1 = Tdata.groupby(by='userId')
types = Tdata['userId'].unique()
info2 = {}
for user_id in types:
    db2 = db1.get_group(user_id)
    # Positional access, as in the original: column 1 is the item key and
    # column 2 the value.
    # NOTE(review): assumes the CSV column order is userId, movieId, rating,
    # timestamp - confirm against the data file.
    info2[user_id] = dict(zip(db2.iloc[:, 1].values, db2.iloc[:, 2].values))
info2_json = json.dumps(info2, indent=4)
train2 = info2
# Load the held-out test ratings and index them as
# {userId: {movieId: rating}} (info4).
# A u'' literal replaces the Python-2-only unicode() call (NameError on
# Python 3); the path text itself is unchanged.
Testdata = pd.read_csv(u'D:/推荐系统数据集/训练集与测试集/testdata.csv')
Testdata.drop(columns='timestamp', inplace=True)
Testdata['userId'] = Testdata['userId'].astype('str')
Testdata['movieId'] = Testdata['movieId'].astype('str')
Testdata['rating'] = Testdata['rating'].astype('float')
# Group by a scalar key so get_group() accepts a plain user id on every
# pandas version (a list-of-one key makes newer releases expect a tuple).
de1 = Testdata.groupby(by='userId')
types = Testdata['userId'].unique()
info4 = {}
for user_id in types:
    de2 = de1.get_group(user_id)
    # Positional access, as in the original: column 1 is the item key and
    # column 2 the value.
    # NOTE(review): assumes the CSV column order is userId, movieId, rating,
    # timestamp - confirm against the data file.
    info4[user_id] = dict(zip(de2.iloc[:, 1].values, de2.iloc[:, 2].values))
info4_json = json.dumps(info4, indent=4)
def GetRank(user, K, train, w, n_neighbors=10):
    """Return the top-K recommended items for ``user`` with their scores.

    For every item the user already has, the ``n_neighbors`` highest-weighted
    entries of that item's row in ``w`` are taken; the weights of neighbours
    the user does not already have are summed per neighbour.

    Args:
        user: User id; it is converted with ``str()`` before the lookup.
        K: Maximum number of items to return.
        train: Mapping ``{user_id_str: {item: value}}``.
        w: Mapping ``{item: {other_item: weight}}``.
        n_neighbors: How many top-weighted neighbours of each item are
            considered (defaults to 10, the value previously hard-coded).

    Returns:
        A dict ``{item: accumulated_weight}`` holding at most ``K`` entries.

    Raises:
        KeyError: If ``str(user)`` is not a key of ``train``.
    """
    ru = train[str(user)]
    rank1 = dict()
    for j in ru:
        # An item may have no row in w (it never co-occurred with another
        # item); it then contributes no candidates instead of raising.
        neighbours = sorted(
            w.get(j, {}).items(), key=lambda x: x[1], reverse=True
        )[:n_neighbors]
        for i, wij in neighbours:
            if i in ru:
                continue
            # Accumulate under the key exactly as stored. The previous
            # ``str(i) not in rank1`` test reset the score on every visit
            # whenever item ids were not strings.
            rank1[i] = rank1.get(i, 0) + wij
    best = sorted(rank1.items(), key=lambda x: x[1], reverse=True)[:K]
    return dict(best)
def Getlist(data, testuser):
    """Collect the item keys stored for ``testuser`` in ``data``.

    Every key of ``data`` is compared with ``testuser`` after both are
    converted with ``str()``; the item keys of each matching entry are
    appended to the result in iteration order. An empty list is returned
    when nothing matches.
    """
    wanted = str(testuser)
    collected = []
    for key, related in data.items():
        if str(key) != wanted:
            continue
        collected.extend(related.keys())
    return collected
def Recall(c1, c2, K):
    """Return the share of test items that appear in the top-K lists.

    For every user key in ``c1`` the recommendations are produced by
    ``GetRank(user, K, train1, W1)``; note that the module-level ``train1``
    and ``W1`` are used for ranking, not ``c1``.

    Args:
        c1: Mapping whose keys are the users to evaluate.
        c2: Mapping ``{user: {item: value}}`` holding the test items.
        K: Length of each recommendation list.

    Returns:
        ``hits / total number of test items`` as a float, or ``0.0`` when
        there are no test items at all (previously a ZeroDivisionError).
    """
    hit = 0
    total = 0  # renamed from ``all`` so the builtin is not shadowed
    for user in c1:
        tu = Getlist(c2, user)
        held_out = set(tu)  # constant-time membership tests
        rank = GetRank(user, K, train1, W1)
        hit += sum(1 for i in rank if i in held_out)
        total += len(tu)
    if total == 0:
        return 0.0
    return hit / (total * 1.0)
def Precision(c1, c2, K):
    """Return the share of recommended slots that hit a test item.

    For every user key in ``c1`` the recommendations are produced by
    ``GetRank(user, K, train1, W1)``; note that the module-level ``train1``
    and ``W1`` are used for ranking, not ``c1``. Each user contributes ``K``
    to the denominator even when fewer than ``K`` items are recommended.

    Args:
        c1: Mapping whose keys are the users to evaluate.
        c2: Mapping ``{user: {item: value}}`` holding the test items.
        K: Length of each recommendation list.

    Returns:
        ``hits / (K * number of users)`` as a float, or ``0.0`` when that
        denominator is zero (previously a ZeroDivisionError).
    """
    hit = 0
    total = 0  # renamed from ``all`` so the builtin is not shadowed
    for user in c1:
        held_out = set(Getlist(c2, user))  # constant-time membership tests
        rank = GetRank(user, K, train1, W1)
        hit += sum(1 for i in rank if i in held_out)
        total += K
    if total == 0:
        return 0.0
    return hit / (total * 1.0)
def Coverage(train, test, K):
    """Return the fraction of the item catalogue that gets recommended.

    The catalogue is every item key found in ``train``. Recommendations come
    from ``GetRank(user, K, train1, W1)``, i.e. from the module-level
    ``train1`` and ``W1``.

    Args:
        train: Mapping ``{user: {item: value}}``.
        test: Unused; kept so existing callers keep working.
        K: Length of each recommendation list.

    Returns:
        ``len(recommended items) / len(all items)`` as a float, or ``0.0``
        when ``train`` contains no items (previously a ZeroDivisionError).
    """
    recommend_items = set()
    all_items = set()
    for user, items in train.items():
        all_items.update(items.keys())
        recommend_items.update(GetRank(user, K, train1, W1))
    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)
def con(t1, t2, w, K, t0=893286639.0, n_recent=10):
    """Score how strongly recommendations relate to recently-stamped items.

    For every user in ``t1`` the top-K list from
    ``GetRank(user, K, train1, W1)`` is compared with that user's
    ``n_recent`` items having the largest timestamps in ``t2``. Each
    (recommended item i, recent item j) pair for which ``w[i]`` has an entry
    for j adds ``w[i][j] / (0.01 * abs(t0 - timestamp_j))``.

    Args:
        t1: Mapping whose keys are the users to evaluate.
        t2: Mapping ``{user: {item: timestamp}}``.
        w: Mapping ``{item: {other_item: weight}}``.
        K: Length of each recommendation list.
        t0: Reference timestamp (defaults to the value previously
            hard-coded). NOTE(review): presumably just after the newest
            timestamp in the data set - confirm.
        n_recent: Number of latest items considered per user (defaults to
            10, the value previously hard-coded).

    Returns:
        The accumulated score divided by ``K * number of users`` as a float,
        or ``0.0`` when that denominator is zero (previously a
        ZeroDivisionError).

    Raises:
        ZeroDivisionError: If a considered timestamp equals ``t0``.
    """
    hit = 0
    total = 0  # renamed from ``all`` so the builtin is not shadowed
    for user in t1:
        rank = GetRank(user, K, train1, W1)
        if rank:
            # The user's latest items do not depend on the recommended item,
            # so sort once per user. t2[user] is still only touched when
            # something was recommended, as before.
            recent = sorted(
                t2[user].items(), key=lambda x: x[1], reverse=True
            )[:n_recent]
            for i in rank:
                sims = w[i]
                for j, tuj in recent:
                    # A direct dict lookup replaces the scan over every
                    # entry of w[i].
                    if j in sims:
                        hit += sims[j] / (0.01 * (abs(t0 - tuj)))
        total += K
    if total == 0:
        return 0.0
    return hit / (total * 1.0)
# Print the con() score for recommendation list lengths 10, 20, ..., 60.
for i in (10, 20, 30, 40, 50, 60):
    con1 = con(train1, train2, W1, i)
    print("{}".format(con1))
def Popularity(train):
    """Count, for every item, how many users' item maps contain it.

    ``train`` maps each user to a mapping keyed by item; the result is a
    plain dict ``{item: count}``.
    """
    counts = {}
    for rated in train.values():
        for item in rated.keys():
            counts[item] = counts.get(item, 0) + 1
    return counts
# Per-item user counts over the training set; passed to novel() further down.
popularlity_all = Popularity(train=train1)
def novel(train, popularlity, K):
    """Return the mean log2(1 + popularity) of all recommended items.

    Recommendations come from ``GetRank(user, K, train1, W1)`` for every
    user key in ``train``, i.e. from the module-level ``train1`` and ``W1``.

    Args:
        train: Mapping whose keys are the users to evaluate.
        popularlity: Mapping ``{item: count}``.
        K: Length of each recommendation list.

    Returns:
        The average of ``log2(1 + popularlity[item])`` over every recommended
        item as a float, or ``0.0`` when nothing was recommended (previously
        a ZeroDivisionError).

    Raises:
        KeyError: If a recommended item is missing from ``popularlity``.
    """
    net = 0
    n = 0
    for user in train:
        rank = GetRank(user, K, train1, W1)
        for item in rank:
            net += math.log(1 + popularlity[item], 2)
            n += 1
    if n == 0:
        return 0.0
    return net / (n * 1.0)
# Print the novel() score for recommendation list lengths 10, 20, ..., 50.
for i in (10, 20, 30, 40, 50):
    novel1 = novel(train1, popularlity_all, i)
    print("{}".format(novel1))
本文只是实现了简单的 item-based 协同过滤算法。因为前面包含数据处理部分,后面又编写了评价指标部分,所以篇幅稍长,不过这也是为了让初学者少走一些弯路。
如果能够对你有一点帮助,那会令我感到非常荣幸,祝你进步,实现理想。
愿你在这个世界永远被温柔相待,也愿我的姐姐能早日康复。
|