from pandas import DataFrame
import pandas as pd
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],
'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],
'data2':range(3)})
print(df1)
'''
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
'''
print(df2)
'''
data2 key
0 0 a
1 1 b
2 2 d
'''
# merge 根据一个或多个键将不同的dataframe中的行连接起来,类似数据库的连接
# 没有指定列名时,默认用重叠的列名做键
pd.merge(df1,df2)
# inner:交集,merge默认做的inner连接
print(pd.merge(df1,df2,how='inner',on='key'))
'''
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
'''
# outer:并集
print(pd.merge(df1,df2,how='outer',on='key'))
'''
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
'''
# left:左连接
print(pd.merge(df1,df2,how='left',on='key'))
'''
data1 key data2
0 0 b 1.0
1 1 b 1.0
2 2 a 0.0
3 3 c NaN
4 4 a 0.0
5 5 a 0.0
6 6 b 1.0
'''
# right:右连接
print(pd.merge(df1,df2,how='right',on='key'))
'''
data1 key data2
0 0.0 b 1
1 1.0 b 1
2 6.0 b 1
3 2.0 a 0
4 4.0 a 0
5 5.0 a 0
6 NaN d 2
'''
# 如果两个对象列名不同也可以分别进行指定
df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],
'data1':range(7)})
df4 = DataFrame({'key2':['a','b','d'],
'data2':range(3)})
df_merge2 = pd.merge(df3,df4,left_on='key1',right_on='key2')
print(df_merge2)
'''
data1 key1 data2 key2
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
'''
# 多对多的合并
df5 = DataFrame({'key':['b','a','a','c'],
'data1':range(4)})
df6 = DataFrame({'key':['a','a','a','b','d'],
'data2':range(5)})
print(df5)
'''
data1 key
0 0 b
1 1 a
2 2 a
3 3 c
'''
print(df6)
'''
data2 key
0 0 a
1 1 a
2 2 a
3 3 b
4 4 d
'''
print(pd.merge(df5,df6,on='key',how='inner')) # 多对多连接产生的行是笛卡尔积,左边2个a,右边3个a,最终6个a。
'''
data1 key data2
0 0 b 3
1 1 a 0
2 1 a 1
3 1 a 2
4 2 a 0
5 2 a 1
6 2 a 2
'''
# 要根据多个键进行合并,传入一个由列名组成的列表
left = DataFrame({'key1':['foo','foo','bar'],
'key2':['one','two','one'],
'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],
'key2':['one','one','one','two'],
'rval':[4,5,6,7]})
print(left)
'''
key1 key2 lval
0 foo one 1
1 foo two 2
2 bar one 3
'''
print(right)
'''
key1 key2 rval
0 foo one 4
1 foo one 5
2 bar one 6
3 bar two 7
'''
print(pd.merge(left,right,on=['key1','key2'],how='outer'))
'''
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
'''
# 对于合并运算后需要处理重复列名,suffixes用于指定附加到左右两个dataframe对象的重叠列名上的字符串
print(pd.merge(left,right,on='key1'))
'''
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
'''
print(pd.merge(left,right,on='key1',suffixes=['_left','_right']))
'''
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
'''