文章目录
数据分析—数据处理工具pandas(四)
六、合并 merge、join
Pandas具有全功能的,高性能内存中连接操作,与SQL等关系数据库非常相似
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=True,
suffixes=('_x', '_y'), copy=True)
1.merge合并(一)
left,right,on=None, left_on=None, right_on=None,left_index=False, right_index=False 参数解析:
df1 = pd.DataFrame({'key': ['K1', 'K0', 'K2', 'K5'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'A1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(df1)
print(df2)
# merge(left, right, on='None)
# merge默认对on所在的共同列取交集,用on新生成的DataFrame的index是按照符合连接后的行,依次按照顺序往下排的,不是以左边或右边的index为index
print(pd.merge(df1, df2, on='key')) # 使用on,作用在left和right共同的键上
# merge(left, right, left_on=None, right_on=None)
print(pd.merge(df1, df2, left_on='A', right_on='C')) # 按照left和right分别指定的列去拼接,若有相同的列,会自动生成_x,_y
# # merge(left, right, left_index=False, right_index=False)
print(pd.merge(df1, df2, left_index=True, right_index=True)) # 有点横向连接的意思
# 结果:
key A B
0 K1 A0 B0
1 K0 A1 B1
2 K2 A2 B2
3 K5 A3 B3
key C D
0 K0 C0 D0
1 K1 A1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K1 A0 B0 A1 D1
1 K0 A1 B1 C0 D0
2 K2 A2 B2 C2 D2
key_x A B key_y C D
0 K0 A1 B1 K1 A1 D1
key_x A B key_y C D
0 K1 A0 B0 K0 C0 D0
1 K0 A1 B1 K1 A1 D1
2 K2 A2 B2 K2 C2 D2
3 K5 A3 B3 K3 C3 D3
2.merge合并(二)
left_on, right_on, left_index, right_index可以相互组合:
# left_on, right_on, left_index, right_index可以相互组合:
# left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index
# 举一个left_on + right_index的例子
df1 = pd.DataFrame({
'key1': np.arange(3),
'key2': ['a', 'b', 'c']
})
df2 = pd.DataFrame({
'key3': np.arange(10, 13),
'key4': np.arange(20, 23),
}, index=['a', 'b', 'c'])
print(df1)
print(df2)
print(pd.merge(df1, df2, left_on='key2', right_index=True))
# 结果:
key1 key2
0 0 a
1 1 b
2 2 c
key3 key4
a 10 20
b 11 21
c 12 22
key1 key2 key3 key4
0 0 a 10 20
1 1 b 11 21
2 2 c 12 22
3.merge合并(三)
参数how → 合并方式:
inner交集, outer并集, left以左边为参考, right以右边为参考
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(df3)
print(df4)
print('交------')
print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner')) # inner:默认,取交集
print('并------')
print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer')) # outer:取并集,数据缺失范围NaN
print('左------')
print(pd.merge(df3, df4, on=['key1','key2'], how = 'left')) # left:按照df3为参考合并,数据缺失范围NaN
# 右类似于左
# 结果:
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
交------
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
并------
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
左------
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
4.merge合并(四)
参数 sort:按照字典顺序通过 连接键 对结果DataFrame进行排序。默认为False,设置为False会大幅提高性能
df1 = pd.DataFrame({'key':list('bbacaab'),
'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
'date2':[11,2,33]})
x1 = pd.merge(df1,df2, on = 'key', how = 'outer')
x2 = pd.merge(df1,df2, on = 'key', sort=True, how = 'outer')
print(x1)
print(x2)
print('------')
# 也可直接用Dataframe的排序方法:sort_values,sort_index
print(x2.sort_values('data1'))
# 结果:
key data1 date2
0 b 1.0 2.0
1 b 3.0 2.0
2 b 7.0 2.0
3 a 2.0 11.0
4 a 5.0 11.0
5 a 9.0 11.0
6 c 4.0 NaN
7 d NaN 33.0
key data1 date2
0 a 2.0 11.0
1 a 5.0 11.0
2 a 9.0 11.0
3 b 1.0 2.0
4 b 3.0 2.0
5 b 7.0 2.0
6 c 4.0 NaN
7 d NaN 33.0
------
key data1 date2
3 b 1.0 2.0
0 a 2.0 11.0
4 b 3.0 2.0
6 c 4.0 NaN
1 a 5.0 11.0
5 b 7.0 2.0
2 a 9.0 11.0
7 d NaN 33.0
5.merge合并(五)
参数 suffixes=(’_x’, ‘_y’):若有两个DataFrame有相同的key,连接时没有按照相同的key连接,会自动生成key_x,key_y,用suffixes=(’_x’, ‘_y’)可修改后缀
df1 = pd.DataFrame({'key': ['K1', 'K0', 'K2', 'K5'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'A1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(df1)
print(df2)
print(pd.merge(df1, df2, how='inner', left_index=True, right_index=True))
print(pd.merge(df1, df2, how='inner', left_index=True, right_index=True, suffixes=('_1', '_2')))
# 结果:
key A B
0 K1 A0 B0
1 K0 A1 B1
2 K2 A2 B2
3 K5 A3 B3
key C D
0 K0 C0 D0
1 K1 A1 D1
2 K2 C2 D2
3 K3 C3 D3
key_x A B key_y C D
0 K1 A0 B0 K0 C0 D0
1 K0 A1 B1 K1 A1 D1
2 K2 A2 B2 K2 C2 D2
3 K5 A3 B3 K3 C3 D3
key_1 A B key_2 C D
0 K1 A0 B0 K0 C0 D0
1 K0 A1 B1 K1 A1 D1
2 K2 A2 B2 K2 C2 D2
3 K5 A3 B3 K3 C3 D3
6.pd.join() -> 直接通过索引链接
join()是特殊的merge()
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
print(right)
print(left.join(right, how='outer')) # 等价于:pd.merge(left, right, left_index=True, right_index=True, how='outer')
# 结果:
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
注意:对于merge()和join()方法,一定要指明连接方法,即一定要写how参数
七、连接与修补 concat、combine_first
连接 – 沿轴执行连接操作
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True)
1.连接:concat(一)
参数 axis :设置横向连接还是竖直连接
s1 = pd.Series([1,2,3])
s2 = pd.Series([2,3,4])
print(pd.concat([s1,s2])) # 默认为竖直连接
print(pd.concat([s1, s2], axis=1)) # axis=1,表示横向连接,成为了一个DataFrame
# 结果:
0 1
1 2
2 3
0 2
1 3
2 4
dtype: int64
0 1
0 1 2
1 2 3
2 3 4
2.连接:concat(二)
参数 join,join_axes:连接方式
s1 = pd.Series(np.arange(4), index=list('abcd'))
s2 = pd.Series(np.arange(10, 14), index=list('cdef'))
# join:{'inner','outer'},默认为“outer”。如何处理其他轴上的索引。outer为联合和inner为交集。
print(pd.concat([s1, s2], join='outer', axis=1))
# join_axes:指定联合的index
print(pd.concat([s1, s2], join_axes=[['b', 'c', 'd']], axis=1)) # 注意join_axes=[[]] 有两个中括号
# 结果:
0 1
a 0.0 NaN
b 1.0 NaN
c 2.0 10.0
d 3.0 11.0
e NaN 12.0
f NaN 13.0
0 1
b 1 NaN
c 2 10.0
d 3 11.0
3.连接:concat(三)
参数 keys:覆盖列名
s1 = pd.Series(np.arange(4), index=list('abcd'))
s2 = pd.Series(np.arange(10, 14), index=list('cdef'))
print(pd.concat([s1, s2], join='outer', axis=1, keys=['a', 'b'])) # 为生成的dataframe的column设置名称
# 结果:
a b
a 0.0 NaN
b 1.0 NaN
c 2.0 10.0
d 3.0 11.0
e NaN 12.0
f NaN 13.0
4.修补 pd.combine_first() 与替换pd.update
df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan],[np.nan, 7., np.nan]])
df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]],index=[1, 2])
print(df1)
print(df2)
# 根据index,df1的空值被df2替代
# 如果df2的index多于df1,则更新到df1上,比如index=['a',1]
print(df1.combine_first(df2)) # 将df2中对于的值填入到df1中
# update,直接df2覆盖df1,相同index位置
df1.update(df2)
print(df1)
# 结果:
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN NaN
2 NaN 7.0 NaN
0 1 2
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
0 1 2
0 NaN 3.0 5.0
1 -4.6 NaN -8.2
2 -5.0 7.0 4.0
0 1 2
0 NaN 3.0 5.0
1 -42.6 NaN -8.2
2 -5.0 1.6 4.0
注意:merge join concat combine_first 对于Series和DataFrame都适用
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/84771.html