运行环境
- windows11
- JDK 8
- anaconda3
- python 3.9
- Neo4j 3.5.32
- python jupyter库
- py2neo
- Visual Studio Code 2022
项目地址: Gitee : https://gitee.com/ccuni/py2neo-neo4j-actual-combat
GitHub:https://github.com/unirithe/py2neo-neo4j-actual-combat
一、数据集说明
数据集来自 IMDB 影视网的电影、演员数据,数据并不全,仅供学习参考。 数据集参考上方的 Gitee 或 GitHub地址
- movie_act.csv 演员id到电影id的映射信息
- data:image/s3,"s3://crabby-images/0dbfe/0dbfef762c0bab3d84a5aeb1879e9a953fc872bf" alt="在这里插入图片描述"
- movie_actor.csv 5334个演员的信息,名称和头像 data:image/s3,"s3://crabby-images/5191a/5191aef48c7d527b4a8ffee28b35f3226a360170" alt="在这里插入图片描述"
- movie_moive.csv 2926部电影的详情信息 data:image/s3,"s3://crabby-images/03f45/03f452557f2b6aa0b038ad64a49ce87491b80f1b" alt="在这里插入图片描述"
- movie_popularity.csv 保留着62部受欢迎的电影信息 data:image/s3,"s3://crabby-images/12684/126846a2951de700e9254317199b3472c9c2612b" alt="在这里插入图片描述"
- user_user.csv 不知道有啥用的id信息 data:image/s3,"s3://crabby-images/00054/000541d5794880b44303ce9436294f5395670db3" alt="在这里插入图片描述"
二、数据预处理
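这里将原先的csv数据转为 pandas 的 DataFrame 后再转化成字典,从而能构建 Node 对象,插入到 Neo4j 中。下文代码中的 df 是一个以表名('movie'、'actor'、'actor_movie'、'popularity')为键、以对应 DataFrame 为值的字典,读取 csv 的代码未在本文列出。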
2.1 选择受欢迎的电影
# Keep only the rows of the movie table whose id appears in the popularity table.
list_mid = df['popularity']['movieid_id']
movie_table = df['movie']
is_popular = movie_table['movieid'].isin(list_mid)
df_popularity_movie = movie_table.loc[is_popular]
# Bare expression: displays the filtered frame when run as a notebook cell.
df_popularity_movie
data:image/s3,"s3://crabby-images/4388e/4388e029578abd21d947f24994115b81b07139ae" alt="在这里插入图片描述"
# Map each popular movie's id to its full row, converted to a plain dict.
dict_movie = {
    row['movieid']: row.to_dict()
    for _, row in df_popularity_movie.iterrows()
}
print('rows: ' , len(dict_movie))
data:image/s3,"s3://crabby-images/216bc/216bc16c2f61e42721c405e5b7d762c8f8cfcf79" alt="在这里插入图片描述"
2.2 查找每部受欢迎电影的所有演员
# Map each popular movie id to the list of actor ids linked to it
# in the actor/movie mapping table.
actor_movie_table = df['actor_movie']
dict_actor_movie = {
    mid: actor_movie_table.loc[
        actor_movie_table['movieid_id'].eq(mid), 'actorid_id'
    ].to_list()
    for mid in df_popularity_movie['movieid']
}
print('rows: ' , len(dict_actor_movie))
data:image/s3,"s3://crabby-images/d527a/d527a3081b6213e417e59279a35ae3d4e75a00f8" alt="在这里插入图片描述"
2.3 查找热门电影里每个演员的信息
# Collect the distinct actor ids that appear in any popular movie.
actors = {actor for ac in dict_actor_movie.values() for actor in ac}

# Map each actor id to the first matching row of the actor table, as a dict.
dict_actor = {}
actor_table = df['actor']
for aid in actors:
    flag = (actor_table['actorid'] == aid)
    row = actor_table[flag].iloc[0]
    dict_actor[aid] = row.to_dict()

# Report the number of actors collected. The previous version printed
# len(dict_actor_movie), which is the number of movies, not actors.
print('rows: ' , len(dict_actor))
data:image/s3,"s3://crabby-images/f9731/f973170dee995c5cb6e8ed6abcdc78a1ce98ca21" alt="在这里插入图片描述"
三、Py2Neo 操作
3.1 连接 Neo4j
from py2neo import Graph

# Connection settings for the local Neo4j instance.
url = "http://localhost:7474"
username = "neo4j"
password = "123456"

credentials = (username, password)
graph = Graph(url, auth=credentials)

graph_info = str(graph)
print("neo4j info: {}".format(graph_info))
输出结果:neo4j info: Graph('http://localhost:7474')
3.2 插入电影和演员节点
from py2neo import Graph, Node, Subgraph
import time

s_time = time.time()

# Clear the whole graph before inserting.
graph.delete_all()

# One "movie" node per popular movie, then one "actor" node per actor;
# each node's properties are the corresponding row dict.
node_list = [Node("movie", **movie) for movie in dict_movie.values()]
node_list += [Node("actor", **actor) for actor in dict_actor.values()]

# Create all nodes in a single subgraph.
graph.create(subgraph=Subgraph(node_list))
print(node_list[0])
data:image/s3,"s3://crabby-images/868aa/868aabf13b1d0f33cb9c3fb158559b85a1fa7ba4" alt="在这里插入图片描述"
输出当前Neo4j 电影和演员节点的个数
# Count the nodes carrying each label, then report both totals.
movie_total = graph.nodes.match("movie").count()
actor_total = graph.nodes.match('actor').count()
print('movie: ', movie_total)
print('actor: ', actor_total)
输出结果: movie: 62 actor: 240
3.3 建立电影和演员之间的联系
from py2neo import NodeMatcher, Relationship

node_matcher = NodeMatcher(graph)

# Build one "acted" relationship per (actor, movie) pair for which both
# nodes can be found in the graph; pairs with a missing node are skipped.
list_rel = []
for mid, actors in dict_actor_movie.items():
    node_movie = node_matcher.match("movie", movieid=mid).first()
    if node_movie is None:
        continue
    for actor in actors:
        node_actor = node_matcher.match("actor", actorid=actor).first()
        if node_actor is None:
            continue
        list_rel.append(Relationship(node_actor, "acted", node_movie, name='acted'))

# Write the relationships to Neo4j in batches of `once`.
once = 50
maxi = len(list_rel)
for i in range(0, maxi, once):
    subgraph = Subgraph(relationships=list_rel[i:i+once])
    graph.separate(subgraph)
    graph.create(subgraph)
    print(f'[INFO] >> created {len(subgraph)} relations')
输出结果: data:image/s3,"s3://crabby-images/9c618/9c618e9e81ebe28d39ea873859f7f9e2c0d5035b" alt="在这里插入图片描述"
登录 Neo4j 的web页面查询插入的结果:http://localhost:7474
data:image/s3,"s3://crabby-images/de7c9/de7c9146d710bc08508bbe3fa5790e43d7d199ce" alt="`"
四、基于Neo4j的数据分析
4.1 查找电影的所有关系
from py2neo import RelationshipMatcher

rmatcher = RelationshipMatcher(graph)

# For every movie node print a numbered header line, then each relationship
# matched with that movie as the second (end) node, then a blank separator.
for index, node_movie in enumerate(graph.nodes.match('movie').all()):
    title_with_rule = node_movie['name'] + '-' *10
    print(index, '-' * 10 , title_with_rule)
    incoming = graph.match([None, node_movie]).all()
    for rel in incoming:
        print('--', rel)
    print('\n\n')
部分输出结果:(共有62部受欢迎的电影) data:image/s3,"s3://crabby-images/4b5fe/4b5fe458b7c7d1e6d03b3e28e149f6271ba1b381" alt="在这里插入图片描述"
4.2 查找根据演员数和评分排序的Top10电影
nodes_movie = graph.nodes.match('movie').all()
rm = RelationshipMatcher(graph)

# Top10
# For each movie node, record its 'acted' relationships and how many there are.
dict_movie_top10 = {}
for node_movie in nodes_movie:
    list_actors = rm.match([None, node_movie], r_type='acted').all()
    dict_movie_top10[node_movie] = {'count': len(list_actors), 'actors': list_actors}


def _rank_key(entry):
    """Sort key: ('acted' relationship count, rating converted to float)."""
    movie_node, stats = entry
    return stats['count'], float(movie_node['rate'])


# Highest count first, ties broken by higher rating; keep the first ten.
ranked = sorted(dict_movie_top10.items(), key=_rank_key, reverse=True)
list_movie_top10 = ranked[:10]

print('------------------ Top10 ------------------')
for node_movie, dict_count in list_movie_top10:
    print(dict_count['count'], node_movie['rate'], node_movie['name'])
输出结果: data:image/s3,"s3://crabby-images/d20f7/d20f79a86474534445e1150de5902f897c2cfbc9" alt="在这里插入图片描述"
翻译过后: Translate to chinese
| 排名 | 评分 | 电影名称 |
|---|---|---|
| 1 | 9.1 | 《肖申克的救赎》 |
| 2 | 9.1 | 《Dekalog》 |
| 3 | 9.0 | 《黑暗骑士》 |
| 4 | 9.0 | 《教父:第二部》 |
| 5 | 8.9 | 《低俗小说》 |
| 6 | 8.8 | 《费城总是阳光明媚》 |
| 7 | 8.8 | 《星球大战5:帝国反击战》 |
| 8 | 8.8 | 《搏击俱乐部》 |
| 9 | 8.7 | 《指环王:双塔奇兵》 |
| 10 | 8.6 | 《星球大战》 |
4.3 保存 Top10数据到 Neo4j
# Remove the results of any previous run. Empty match results are skipped
# rather than wrapped in an empty Subgraph, and the relationship match is
# passed via `relationships=` because Subgraph's first positional argument
# is `nodes`.
for stale_label in ('actor_top10', 'movie_top10'):
    stale_nodes = graph.nodes.match(stale_label).all()
    if stale_nodes:
        graph.delete(Subgraph(stale_nodes))
stale_rels = RelationshipMatcher(graph).match(name='acted_top10').all()
if stale_rels:
    graph.delete(Subgraph(relationships=stale_rels))

# Build the Top10 copy in memory. Nodes are cached by name so each actor and
# each movie becomes exactly one node, however many relationships touch it.
# The previous version looked nodes up in the database before anything had
# been created (so the lookup never matched) and assigned the actor hit to a
# misspelled variable, which produced one new node pair per relationship.
rel_top10 = []
actor_nodes_by_name = {}
movie_nodes_by_name = {}
for node_movie, dict_count in list_movie_top10:
    movie_name = node_movie['name']
    if movie_name not in movie_nodes_by_name:
        movie_nodes_by_name[movie_name] = Node('movie_top10', **dict(node_movie))
    movie = movie_nodes_by_name[movie_name]
    for actor_rel in dict_count['actors']:
        source_actor = actor_rel.start_node
        actor_name = source_actor['name']
        if actor_name not in actor_nodes_by_name:
            actor_nodes_by_name[actor_name] = Node('actor_top10', **dict(source_actor))
        actor = actor_nodes_by_name[actor_name]
        rel_top10.append(Relationship(actor, "acted", movie, name='acted_top10'))

# Nothing to write when no Top10 movie has any 'acted' relationship.
if rel_top10:
    sub_rels = Subgraph(relationships=rel_top10)
    graph.separate(subgraph=sub_rels)
    graph.create(subgraph=sub_rels)

print('The number of actor_top10 node: ',graph.nodes.match('actor_top10').count())
print('The number of moive_top10 node: ', graph.nodes.match('movie_top10').count())
print('The number of relationsip: ', graph.relationships.match(name='acted_top10').count())
输出结果: data:image/s3,"s3://crabby-images/405f0/405f02e69d315201117ce255b993fee2e47aaa0d" alt="在这里插入图片描述"
从 web中查询的结果如下: data:image/s3,"s3://crabby-images/5ecdd/5ecdd2aa9c768294a4896448c634441b125bc903" alt="在这里插入图片描述"
五、总结
通过本次的尝试,我们使用 py2neo 对 Neo4j 进行了增删改查,熟悉了 Node、Relationship、Graph 的使用,另外还用到了大量 pandas 相关的操作。最终分析了电影和演员之间的关系,当然还有更多指标可以分析,比如:出现次数最多的演员以及电影、同步出现率最高的电影等等。
py2neo实现neo4j的增删改查还是挺轻松的。
|