其他
软件应用丨Pandas玩转数据进阶:(一)
版权声明:本文为CSDN博主「向前走别回头」的原创文章合辑,遵循 CC 4.0 BY-SA 版权协议,特此附上原文出处链接及本声明。
原文链接:
https://blog.csdn.net/weixin_39778570/article/details/81105809
https://blog.csdn.net/weixin_39778570/article/details/81106289
https://blog.csdn.net/weixin_39778570/article/details/81106642
https://blog.csdn.net/weixin_39778570/article/details/81107033
https://blog.csdn.net/weixin_39778570/article/details/81107451
简单计算
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Series 计算 可以计算加减乘,这里以加法为例
s1 = Series([1,2,3], index=['B','C','D'])
s2 = Series([4,5,6,7], index=['B','C','D','E'])
# 没有的数据为nan
s1 + s2
Out[10]:
B 5.0
C 7.0
D 9.0
E NaN
dtype: float64
# DataFrame计算,可加减乘
df1 = DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['BJ','GZ'])
df1
Out[13]:
BJ GZ
A 0 1
B 2 3
df2 = DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['BJ', 'GZ', 'SH'])
df2
Out[15]:
BJ GZ SH
A 0 1 2
B 3 4 5
C 6 7 8
df1+df2
Out[16]:
BJ GZ SH
A 0.0 2.0 NaN
B 5.0 7.0 NaN
C NaN NaN NaN
# DataFrame相关函数
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]], index=['A','B','C'], columns=['c1','c2','c3'])
df3
Out[19]:
c1 c2 c3
A 1 2 3.0
B 4 5 NaN
C 7 8 9.0
# 列和
df3.sum()
Out[20]:
c1 12.0
c2 15.0
c3 12.0
dtype: float64
# 行和
df3.sum(axis=1)
Out[21]:
A 6.0
B 9.0
C 24.0
dtype: float64
# 最大值与最小值
df3.max()
Out[22]:
c1 7.0
c2 8.0
c3 9.0
dtype: float64
df3.max(axis=1)
Out[23]:
A 3.0
B 5.0
C 9.0
dtype: float64
df3.min()
Out[24]:
c1 1.0
c2 2.0
c3 3.0
dtype: float64
df3.min(axis=1)
Out[25]:
A 1.0
B 4.0
C 7.0
dtype: float64
# describe描述
df3.describe()
Out[26]:
c1 c2 c3
count 3.0 3.0 2.000000
mean 4.0 5.0 6.000000
std 3.0 3.0 4.242641
min 1.0 2.0 3.000000
25% 2.5 3.5 4.500000
50% 4.0 5.0 6.000000
75% 5.5 6.5 7.500000
max 7.0 8.0 9.000000
左右滑动查看更多
Series和DataFrame排序
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Series排序
s1 = Series(np.random.randn(10))
s1
Out[5]:
0 -1.293472
1 0.017588
2 -0.654741
3 0.495720
4 -1.626396
5 -0.651238
6 0.776535
7 -0.746762
8 -1.358951
9 0.247930
dtype: float64
# 值排序
s2 = s1.sort_values()
s2
Out[10]:
4 -1.626396
8 -1.358951
0 -1.293472
7 -0.746762
2 -0.654741
5 -0.651238
1 0.017588
9 0.247930
3 0.495720
6 0.776535
dtype: float64
# 降序排序
s2 = s1.sort_values(ascending=False)
s2
Out[13]:
6 0.776535
3 0.495720
9 0.247930
1 0.017588
5 -0.651238
2 -0.654741
7 -0.746762
0 -1.293472
8 -1.358951
4 -1.626396
dtype: float64
# 对index进行排序,降序同样修改ascending为False就好
s2.sort_index()
Out[14]:
0 -1.293472
1 0.017588
2 -0.654741
3 0.495720
4 -1.626396
5 -0.651238
6 0.776535
7 -0.746762
8 -1.358951
9 0.247930
dtype: float64
# DataFrame排序
df1 = DataFrame(np.random.randn(40).reshape(8,5), columns=['A','B','C','D','E'])
df1
Out[17]:
A B C D E
0 -0.364749 -2.234539 0.560983 -0.205768 -0.685511
1 1.500545 0.669751 -0.810748 -1.499093 -0.369835
2 0.894716 -0.282788 0.293292 1.260618 -0.107138
3 -0.262395 1.970482 1.268629 -0.626314 -0.726878
4 -1.756154 0.471681 -0.204594 -0.978793 -2.082535
5 0.476344 0.588654 -0.303897 1.863167 -1.466623
6 -1.704993 -0.136662 -0.034966 0.159871 -0.848923
7 1.117809 0.548713 -1.713026 1.153380 -1.529988
# 某一列Series排序
df1['A'].sort_values()
Out[18]:
4 -1.756154
6 -1.704993
0 -0.364749
3 -0.262395
5 0.476344
2 0.894716
7 1.117809
1 1.500545
Name: A, dtype: float64
# DataFrame对某列进行排序
df1.sort_values('A')
Out[19]:
A B C D E
4 -1.756154 0.471681 -0.204594 -0.978793 -2.082535
6 -1.704993 -0.136662 -0.034966 0.159871 -0.848923
0 -0.364749 -2.234539 0.560983 -0.205768 -0.685511
3 -0.262395 1.970482 1.268629 -0.626314 -0.726878
5 0.476344 0.588654 -0.303897 1.863167 -1.466623
2 0.894716 -0.282788 0.293292 1.260618 -0.107138
7 1.117809 0.548713 -1.713026 1.153380 -1.529988
1 1.500545 0.669751 -0.810748 -1.499093 -0.369835
# 降序排序
df2 = df1.sort_values('A', ascending=False)
df2
Out[22]:
A B C D E
1 1.500545 0.669751 -0.810748 -1.499093 -0.369835
7 1.117809 0.548713 -1.713026 1.153380 -1.529988
2 0.894716 -0.282788 0.293292 1.260618 -0.107138
5 0.476344 0.588654 -0.303897 1.863167 -1.466623
3 -0.262395 1.970482 1.268629 -0.626314 -0.726878
0 -0.364749 -2.234539 0.560983 -0.205768 -0.685511
6 -1.704993 -0.136662 -0.034966 0.159871 -0.848923
4 -1.756154 0.471681 -0.204594 -0.978793 -2.082535
# 对index进行排序
df2.sort_index()
Out[23]:
A B C D E
0 -0.364749 -2.234539 0.560983 -0.205768 -0.685511
1 1.500545 0.669751 -0.810748 -1.499093 -0.369835
2 0.894716 -0.282788 0.293292 1.260618 -0.107138
3 -0.262395 1.970482 1.268629 -0.626314 -0.726878
4 -1.756154 0.471681 -0.204594 -0.978793 -2.082535
5 0.476344 0.588654 -0.303897 1.863167 -1.466623
6 -1.704993 -0.136662 -0.034966 0.159871 -0.848923
7 1.117809 0.548713 -1.713026 1.153380 -1.529988
# 一个简单的小例子,对movie_metadata.csv的imdb进行排序
# Open the CSV in a context manager so the file handle is closed as soon as
# pandas has finished reading it.
with open('movie_metadata.csv') as f:
    movie = pd.read_csv(f)
# Keep only title, director and score, ordered from highest to lowest score.
imdb = movie[["movie_title", "director_name","imdb_score"]].sort_values("imdb_score", ascending=False)
# Write the sorted table to a new CSV file.
imdb.to_csv('imdb.csv')
左右滑动查看更多
DataFrame重命名
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
df1 = DataFrame(np.arange(9).reshape(3,3), index=['BJ','SH','GZ'], columns=['A','B','C'])
df1
Out[5]:
A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
df1.index
Out[6]: Index(['BJ', 'SH', 'GZ'], dtype='object')
# 方式一,直接用Series修改
df1.index = Series(['bj', 'sh', 'gz'])
df1.index
Out[8]: Index(['bj', 'sh', 'gz'], dtype='object')
# 方式二,使用map进行修改
df1.index.map(str.upper)
Out[10]: Index(['BJ', 'SH', 'GZ'], dtype='object')
df1.index = df1.index.map(str.upper)
df1
Out[12]:
A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
# 方式三,使用rename进行修改
df1.rename(index=str.lower)
Out[13]:
A B C
bj 0 1 2
sh 3 4 5
gz 6 7 8
df1
Out[14]:
A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
df1 = df1.rename(index=str.lower, columns=str.lower)
df1
Out[18]:
a b c
bj 0 1 2
sh 3 4 5
gz 6 7 8
# 同时修改行和列
df1 = df1.rename(index=str.lower, columns=str.lower)
df1
Out[18]:
a b c
bj 0 1 2
sh 3 4 5
gz 6 7 8
df1.rename(index={'bj':'beijing'}, columns={'a':'A'})
Out[19]:
A b c
beijing 0 1 2
sh 3 4 5
gz 6 7 8
# 谈一谈map, 从list1到list2的方式
list1 = [1,2,3,4]
list2 = ['1','2','3','4']
# 列表解析
[str(x) for x in list1]
Out[22]: ['1', '2', '3', '4']
# 传入自定义函数
def test_map(x):
return x+'_ABC'
df1.index.map(test_map)
Out[29]: Index(['bj_ABC', 'sh_ABC', 'gz_ABC'], dtype='object')
df1.rename(index=test_map)
Out[30]:
a b c
bj_ABC 0 1 2
sh_ABC 3 4 5
gz_ABC 6 7 8
左右滑动查看更多
DataFrame的merge
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
df1 = DataFrame({'key':['X','Y','Z'], 'data_set_1':[1,2,3]})
df1
Out[5]:
data_set_1 key
0 1 X
1 2 Y
2 3 Z
df2 = DataFrame({'key':['A','B','C'], 'data_set_2':[4,5,6]})
df2
Out[7]:
data_set_2 key
0 4 A
1 5 B
2 6 C
# 没有相同的列值
pd.merge(df1,df2)
Out[8]:
Empty DataFrame
Columns: [data_set_1, key, data_set_2]
Index: []
# 默认合并
df2 = DataFrame({'key':['X','B','C'], 'data_set_2':[4,5,6]})
pd.merge(df1,df2)
Out[10]:
data_set_1 key data_set_2
0 1 X 4
df1 = DataFrame({'key':['X','Y','Z','X'], 'data_set_1':[1,2,3,4]})
pd.merge(df1,df2)
Out[12]:
data_set_1 key data_set_2
0 1 X 4
1 4 X 4
# on为指定列,默认情况下会自动找到同名列;若指定了不存在的列名会报错;有两列以上同名时默认按全部同名列合并,如只想按其中某一列合并则需要指定on
pd.merge(df1,df2,on='key')
Out[13]:
data_set_1 key data_set_2
0 1 X 4
1 4 X 4
# 连接的方式,how=inner(默认),left,right,outer
pd.merge(df1,df2,on='key',how='inner')
Out[15]:
data_set_1 key data_set_2
0 1 X 4
1 4 X 4
pd.merge(df1,df2,on='key',how='left')
Out[16]:
data_set_1 key data_set_2
0 1 X 4.0
1 2 Y NaN
2 3 Z NaN
3 4 X 4.0
pd.merge(df1,df2,on='key',how='right')
Out[17]:
data_set_1 key data_set_2
0 1.0 X 4
1 4.0 X 4
2 NaN B 5
3 NaN C 6
pd.merge(df1,df2,on='key',how='outer')
Out[18]:
data_set_1 key data_set_2
0 1.0 X 4.0
1 4.0 X 4.0
2 2.0 Y NaN
3 3.0 Z NaN
4 NaN B 5.0
5 NaN C 6.0
左右滑动查看更多
Concatenate和Combine
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# array上的Concatenate
arr1 = np.arange(9).reshape(3,3)
arr1
Out[6]:
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
arr2 = np.arange(9).reshape(3,3)
arr2
Out[9]:
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
# 进行concatenate,axis参数表示结合方向,默认0是纵向结合
np.concatenate([arr1,arr2])
Out[10]:
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
np.concatenate([arr1,arr2], axis=1)
Out[11]:
array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])
# Series上的concatenate
s1 = Series([1,2,3], index=['X','Y','Z'])
S2 = Series([4,5], index=['A','B'])
S2
Out[15]:
A 4
B 5
dtype: int64
pd.concat([s1,S2])
Out[16]:
X 1
Y 2
Z 3
A 4
B 5
dtype: int64
# 缺失值会补齐为NaN
pd.concat([s1,S2], axis=1)
Out[17]:
0 1
A NaN 4.0
B NaN 5.0
X 1.0 NaN
Y 2.0 NaN
Z 3.0 NaN
# DataFrame上的concatenate
df1 = DataFrame(np.random.rand(4,3), columns=['X','Y','Z'])
df1
Out[20]:
X Y Z
0 0.093816 0.087879 0.539844
1 0.087522 0.012905 0.446522
2 0.269924 0.213385 0.900469
3 0.004105 0.437186 0.817560
df2 = DataFrame(np.random.rand(3,3), columns=['X','Y','A'])
df2
Out[22]:
X Y A
0 0.938714 0.122255 0.189125
1 0.592859 0.459991 0.596478
2 0.337845 0.977800 0.401993
pd.concat([df1,df2])
Out[24]:
A X Y Z
0 NaN 0.093816 0.087879 0.539844
1 NaN 0.087522 0.012905 0.446522
2 NaN 0.269924 0.213385 0.900469
3 NaN 0.004105 0.437186 0.817560
0 0.189125 0.938714 0.122255 NaN
1 0.596478 0.592859 0.459991 NaN
2 0.401993 0.337845 0.977800 NaN
pd.concat([df1,df2],axis=1)
Out[25]:
X Y Z X Y A
0 0.093816 0.087879 0.539844 0.938714 0.122255 0.189125
1 0.087522 0.012905 0.446522 0.592859 0.459991 0.596478
2 0.269924 0.213385 0.900469 0.337845 0.977800 0.401993
3 0.004105 0.437186 0.817560 NaN NaN NaN
# Combine,后一个对象补齐前一个对象
# Series
s1 = Series([2,np.nan,4,np.nan], index=['A','B','C','D'])
s1
Out[29]:
A 2.0
B NaN
C 4.0
D NaN
dtype: float64
s2 = Series([1,2,3,4], index=['A','B','C','D'])
s2
Out[31]:
A 1
B 2
C 3
D 4
dtype: int64
# s1中没有的值被s2补齐了
s1.combine_first(s2)
Out[32]:
A 2.0
B 2.0
C 4.0
D 4.0
dtype: float64
# DataFrame,和Series类似
df1 = DataFrame({'X':[1,np.nan,3,np.nan], 'Y':[5,np.nan,7,np.nan], 'Z':[9,np.nan,11,np.nan]})
df1
Out[36]:
X Y Z
0 1.0 5.0 9.0
1 NaN NaN NaN
2 3.0 7.0 11.0
3 NaN NaN NaN
df2 = DataFrame({'Z':[np.nan,10,np.nan,12], 'A':[1,2,3,4]})
df2
Out[38]:
A Z
0 1 NaN
1 2 10.0
2 3 NaN
3 4 12.0
df1.combine_first(df2)
Out[39]:
A X Y Z
0 1.0 1.0 5.0 9.0
1 2.0 NaN NaN 10.0
2 3.0 3.0 7.0 11.0
3 4.0 NaN NaN 12.0
左右滑动查看更多
·END·
软件应用丨Pandas入门系列(三):简单数据处理
软件应用丨Pandas入门系列(二):Pandas io操作
软件应用丨Pandas入门系列(一):深入理解Series和DataFrame
数据Seminar
这里是大数据、分析技术与学术研究的三叉路口
出处:CSDN 作者:向前走别回头 推荐:青酱 排版编辑:青酱
欢迎扫描👇二维码添加关注