##Pandas怎样实现DataFrame的Merge
'''Pandas的Merge,相当于Sql的Join,将不同的表按key关联到一个表
merge的语法:
pd.mergellet, right, howe"inner , on=None, left_on=None,right_on=None, lef_index=False,right index=False, sort=True, suffixes=(_x', '_ y), cop=True,indicator=False, validate=None)
- left,right:要merge的dataframe或者有name的Series- how: join类型,"left" , 'right", 'outer', 'inner'
- on: join的key,left和right都需要有这个key. left_on: left的df或者series的key
. right_on: right的df或者seires的key
. left_index,right_index:使用index而不是普通的column做oin
. suffixes:两个元素的后缀,如果列有重名,自动添加后缀,默认是('_X, '_Y')
文档地址: https:pandas.pydata.orglpandas-docs/stable/referencelapi/pandas.DataFrame.merge.html本次讲解提纲:
1.电影数据集的join实例
2.理解merge时一对一、一对多、多对多的数量对齐关系3.理解left join、right join、inner join、outer join的区别4.如果出现非Key的字段重名怎么办
'''
##1.电影数据集的join实例
import pandas as pd
file1 = "F:\\python387\\pandas\\ratings.dat"#尽量避免长路径
ra = pd.read_csv( file1,
sep ='::',
engine= 'python',
names = "UserID::MovieID::Rating::Timestamp".split("::")
)
file2 = "F:\\python387\\pandas\\users.dat"#尽量避免长路径
us = pd.read_csv( file2,
sep ='::',
engine= 'python',
names = "UserID::Gender::Age::Occupation::Zip-code".split("::")
)
file3 = "F:\\python387\\pandas\\movies.dat"#尽量避免长路径
mo = pd.read_csv( file3,
sep ='::',
engine= 'python',
names = "MovieID::Title::Genres".split("::"),encoding = "ISO-8859-1"
)#encoding = "ISO-8859-1"解决'UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3114: invalid continuation byte'这个问题
df_ratings_uers = pd.merge(
ra,us,left_on ='UserID',right_on ='UserID',how ='inner'
)
df_ratings_uers_movies = pd.merge(
df_ratings_uers,mo,left_on ="MovieID",right_on ="MovieID",how='inner'
)
#print(df_ratings_uers_movies.head())
##2、理解merge时数量的对齐关系以下关系要正确理解:
'''
. one-to-one:一对一关系,关联的key都是唯一的
-L比如(学号,姓名) merge (学号,年龄)-结果条数为:1*1
one-to-many:—对多关系,左边唯一key,右边不唯一key
-比如(学号,姓名) merge (学号,[语文或绩、数学成绩、英语成绩])-结果条数为;1*N
- many-to-many:多对多关系,左边右边部不是唯一的
. ik如(学号,[语文成绩、数学成绩、英语成绩]》merge (学号。[篮球、足球、乒乓球)·结果条数为:M*N'''
#一对一关系的merge
left = pd.DataFrame({'sno':[11,12,13,14],'name':['a','b','c','d']})
right =pd.DataFrame({'sno':[11,12,13,14],'age':[23,24,25,26]})
#一对多关系的merge
right1= pd.DataFrame({'sno':[11,11,11,12,12,13],
'grade':['语文88','英语72','数学89','英语99','语文67','化学90']})
#print(pd.merge(left,right1,on = 'sno'))
#多对多关系的merge
left1 = pd.DataFrame({'sno':[11,11,12,12,13,14],'name':['a','b','c','d','e','f']})
#print(pd.merge(left1,right1,on = 'sno'))
##3、理解left join、right join、inner join、outer join的区别
left = pd.DataFrame({'key':['KO','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['KO','K1','K4','K5'],
'C':['C0','C1','C4','C5'],
'D':['D0','D1','D4','D5']
})
#3.1inner join,默认
m=pd.merge(left,right, how='inner')
left_join = pd.merge(left,right, how='left')
right_join = pd.merge(left,right, how='right')
outer_join = pd.merge(left,right, how='outer')
print(outer_join )
##4.如果出现非key的字段重名怎么办
left1 = pd.DataFrame({'key':['KO','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right1 = pd.DataFrame({'key':['KO','K1','K4','K5'],
'A':['A0','A1','A2','A3'],
'D':['D0','D1','D4','D5']})
m = pd.merge(left1, right1, on ='key')
m1 = pd.merge(left1,right1,on='key',suffixes=('_left','_right'))#换成指定的后缀
print(m1)
|