##读取、处理Yelp数据集
import pandas as pd
import json
reviews_path = r"../yelp_dataset/yelp_academic_dataset_review.json"

# Read the raw review dump (one JSON object per line) and keep only the
# reviews whose date starts with the year 2017. Each kept review contributes
# one entry to every list below, so the lists stay aligned column-wise.
users_id = []
items_id = []
ratings = []
reviews = []
dates = []
# The context manager guarantees the file handle is closed, even when a line
# fails to parse or a record lacks an expected key.
with open(reviews_path, 'r', encoding="utf-8") as file:
    for line in file:
        # A blank line (e.g. trailing newline at end of file) is not a
        # record; json.loads would raise on it.
        if not line.strip():
            continue
        js = json.loads(line)
        # NOTE: echoes every record, which is very verbose on a large dump.
        print(js)
        # Records carrying the placeholder id 'unknown' are reported and
        # skipped.
        if str(js['user_id']) == 'unknown':
            print("unknown user id")
            continue
        if str(js['business_id']) == 'unknown':
            print("unkown item id")
            continue
        date = str(js["date"])
        # The year is the text before the first '-' of the date string.
        _date = date.split('-')[0]
        if _date == '2017':
            reviews.append(js['text'])
            users_id.append(str(js['user_id']))
            items_id.append(str(js['business_id']))
            # Ratings are stored as strings, exactly as str() renders them.
            ratings.append(str(js['stars']))
            dates.append(date)

# Assemble the parallel lists into one table, one column per list.
data_frame = {
    'user_id': pd.Series(users_id),
    'item_id': pd.Series(items_id),
    'ratings': pd.Series(ratings),
    'reviews': pd.Series(reviews),
    'date': pd.Series(dates),
}
data = pd.DataFrame(data_frame)
print(data)
# Give every distinct user id and item id a unique integer code.
# The distinct ids are sorted before numbering so that the same input always
# yields the same codes: iterating a set of strings directly follows hash
# order, which Python randomizes per process, so codes would otherwise differ
# from run to run.
userID = sorted(set(users_id))
itemID = sorted(set(items_id))
item2id = {key: index for index, key in enumerate(itemID)}
user2id = {key: index for index, key in enumerate(userID)}
def numerize(data, user2id, item2id):
    """Replace the raw user and item ids in ``data`` with integer codes.

    Args:
        data: Object with 'user_id' and 'item_id' entries that can be
            iterated and reassigned (a pandas DataFrame at the call site
            in this file).
        user2id: Lookup from raw user id to its integer code.
        item2id: Lookup from raw item id to its integer code.

    Returns:
        The same ``data`` object, with both columns overwritten in place.

    Raises:
        KeyError: When the lookups are dicts and an id is missing from one.
    """
    # Both columns are translated before either is written back, so a failed
    # lookup leaves ``data`` unmodified.
    encoded_users = [user2id[raw_id] for raw_id in data['user_id']]
    encoded_items = [item2id[raw_id] for raw_id in data['item_id']]
    data['user_id'] = encoded_users
    data['item_id'] = encoded_items
    return data
# Swap the raw ids in the table for their integer codes and show the result.
data = numerize(data, user2id=user2id, item2id=item2id)
print(data)
|