Software Applications | Playing with Data in Pandas, Advanced (Part 3)
Copyright notice: this article is a compilation of original posts by the CSDN blogger 「向前走别回头」 and is distributed under the CC 4.0 BY-SA license; the original source links and this notice are reproduced below.
Original links:
https://blog.csdn.net/weixin_39778570/article/details/81117289
https://blog.csdn.net/weixin_39778570/article/details/81117881
https://blog.csdn.net/weixin_39778570/article/details/81118513
https://blog.csdn.net/weixin_39778570/article/details/81119547
https://blog.csdn.net/weixin_39778570/article/details/81124608
See the previous installment: Software Applications | Playing with Data in Pandas, Advanced (Part 2)
Data Binning
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# Randomly generate a set of 20 scores
score_list = np.random.randint(25, 100, size=20)
score_list
Out[5]:
array([53, 67, 36, 37, 30, 63, 83, 83, 54, 97, 32, 74, 52, 47, 82, 71, 59,
59, 97, 25])
# Bin with pd.cut; bins defines the cut edges
bins = [0,59,70,80,100]
pd.cut(score_list, bins)
Out[7]:
[(0, 59], (59, 70], (0, 59], (0, 59], (0, 59], ..., (70, 80], (0, 59], (0, 59], (80, 100], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
# Inspect the result: count each category with pd.value_counts()
score_cat = pd.cut(score_list, bins)
pd.value_counts(score_cat)
Out[9]:
(0, 59] 11
(80, 100] 5
(70, 80] 2
(59, 70] 2
dtype: int64
# Build a DataFrame around the scores
df = DataFrame()
df['score'] = score_list
df
Out[12]:
score
0 53
1 67
2 36
3 37
4 30
5 63
6 83
7 83
8 54
9 97
10 32
11 74
12 52
13 47
14 82
15 71
16 59
17 59
18 97
19 25
# pd.util.testing.rands(3) generates a random 3-character string
df['student'] = [pd.util.testing.rands(3) for i in range(20)]
df
Out[14]:
score student
0 53 6DW
1 67 4ji
2 36 1pp
3 37 h0w
4 30 21w
5 63 H7d
6 83 DDu
7 83 vFc
8 54 vdb
9 97 KNc
10 32 3CG
11 74 hW6
12 52 IsC
13 47 mqx
14 82 p9J
15 71 d86
16 59 ND9
17 59 iVH
18 97 MMt
19 25 qxW
# Bin the score column
pd.cut(df['score'], bins)
Out[15]:
0 (0, 59]
1 (59, 70]
2 (0, 59]
3 (0, 59]
4 (0, 59]
5 (59, 70]
6 (80, 100]
7 (80, 100]
8 (0, 59]
9 (80, 100]
10 (0, 59]
11 (70, 80]
12 (0, 59]
13 (0, 59]
14 (80, 100]
15 (70, 80]
16 (0, 59]
17 (0, 59]
18 (80, 100]
19 (0, 59]
Name: score, dtype: category
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
# Pass labels to name the bins
df['categories'] = pd.cut(df['score'], bins, labels=['low','ok','good','great'])
df
Out[20]:
score student categories
0 53 6DW low
1 67 4ji ok
2 36 1pp low
3 37 h0w low
4 30 21w low
5 63 H7d ok
6 83 DDu great
7 83 vFc great
8 54 vdb low
9 97 KNc great
10 32 3CG low
11 74 hW6 good
12 52 IsC low
13 47 mqx low
14 82 p9J great
15 71 d86 good
16 59 ND9 low
17 59 iVH low
18 97 MMt great
19 25 qxW low
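pd.cut uses the fixed edges you supply. If instead you want bins that each hold roughly the same number of scores, pandas also offers pd.qcut, which cuts at quantiles. A minimal sketch reusing the score_list generated above:
# Quantile-based binning: 4 bins with roughly equal counts
score_qcat = pd.qcut(score_list, 4)
pd.value_counts(score_qcat)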
Data Grouping with Groupby
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Open the file
f = open('city_weather.csv')
df = pd.read_csv(f)
df
Out[7]:
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4
# Group the data by city
g = df.groupby(df['city'])
g
Out[9]: <pandas.core.groupby.DataFrameGroupBy object at 0x0000026486F22BA8>
g.groups
Out[10]:
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
'SZ': Int64Index([18, 19], dtype='int64')}
# Get a single group
g.get_group('BJ')
Out[11]:
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
# Work on a single group
df_bj = g.get_group('BJ')
df_bj.mean()
Out[14]:
temperature 10.000000
wind 2.833333
dtype: float64
df_bj.describe()
Out[15]:
temperature wind
count 6.000000 6.000000
mean 10.000000 2.833333
std 8.532292 1.169045
min -3.000000 2.000000
25% 5.750000 2.000000
50% 10.000000 2.500000
75% 17.250000 3.000000
max 19.000000 5.000000
# Look at the whole GroupBy at once
g.mean()
Out[16]:
temperature wind
city
BJ 10.000 2.833333
GZ 8.750 4.000000
SH 4.625 3.625000
SZ 5.000 2.500000
g.describe()
Out[17]:
temperature wind \
count mean std min 25% 50% 75% max count
city
BJ 6.0 10.000 8.532292 -3.0 5.75 10.0 17.25 19.0 6.0
GZ 4.0 8.750 11.842719 -1.0 0.50 5.5 13.75 25.0 4.0
SH 8.0 4.625 12.489281 -10.0 -5.25 2.0 17.50 20.0 8.0
SZ 2.0 5.000 21.213203 -10.0 -2.50 5.0 12.50 20.0 2.0
mean std min 25% 50% 75% max
city
BJ 2.833333 1.169045 2.0 2.00 2.5 3.00 5.0
GZ 4.000000 1.414214 2.0 3.50 4.5 5.00 5.0
SH 3.625000 1.060660 2.0 3.00 3.5 4.25 5.0
SZ 2.500000 2.121320 1.0 1.75 2.5 3.25 4.0
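You can also pick out a single column from the GroupBy before aggregating, which avoids computing statistics for columns you do not need. A small sketch based on the same g:
# Aggregate only the temperature column, per city
g['temperature'].mean()                # Series indexed by city
g['temperature'].agg(['min', 'max'])   # one row per city, min and max columns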
# A GroupBy can be converted to a list of tuples: the first element is the group name, the second is the group's DataFrame
list(g)
Out[18]:
[('BJ', date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3),
('GZ', date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4),
('SH', date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5),
('SZ', date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4)]
# It can also be converted to a dict
dict(list(g))
Out[19]:
{'BJ': date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3,
'GZ': date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4,
'SH': date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5,
'SZ': date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4}
# Iterate over the GroupBy: each group yields a name and its DataFrame
for name, group_df in g:
    print(name)
    print(group_df)
BJ
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
GZ
date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
SH
date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ
date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4
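Because iterating over the GroupBy yields (name, DataFrame) pairs, you can build a summary directly from the loop. A minimal sketch, assuming the same g as above:
# Maximum temperature per city, built from the iteration
city_max_temp = Series({name: group_df['temperature'].max() for name, group_df in g})
city_max_temp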
Data Aggregation
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# Open the CSV file
f = open('city_weather.csv')
df = pd.read_csv(f)
df
Out[12]:
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4
# Group df by city
g = df.groupby('city')
# Try the built-in aggregation function min
g.agg('min')
Out[7]:
date temperature wind
city
BJ 03/01/2016 -3 2
GZ 14/08/2016 -1 2
SH 03/07/2016 -10 2
SZ 11/09/2016 -10 1
# Custom function to inspect the type of the object passed in
def foo(attr):
    print(type(attr))
    return np.nan
# Each column of each group is passed in as a Series
g.agg(foo)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Out[9]:
date temperature wind
city
BJ NaN NaN NaN
GZ NaN NaN NaN
SH NaN NaN NaN
SZ NaN NaN NaN
# Aggregate with a custom function: the range (max - min) of each column
def foo(attr):
    return attr.max() - attr.min()
g.agg(foo)
Out[11]:
temperature wind
city
BJ 22 3
GZ 26 3
SH 30 3
SZ 30 3
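agg also accepts a list of functions, applying each one to every column and returning a result with multi-level columns. A sketch restricted to the numeric columns so the date strings are left out:
# Several aggregations at once on the numeric columns
g[['temperature', 'wind']].agg(['min', 'max', 'mean'])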
# You can also group by two columns
g_new = df.groupby(['city', 'wind'])
g_new
Out[14]: <pandas.core.groupby.DataFrameGroupBy object at 0x000001C7B3DCDE10>
g_new.groups
Out[15]:
{('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
('BJ', 3): Int64Index([3, 5], dtype='int64'),
('BJ', 5): Int64Index([0], dtype='int64'),
('GZ', 2): Int64Index([14], dtype='int64'),
('GZ', 4): Int64Index([17], dtype='int64'),
('GZ', 5): Int64Index([15, 16], dtype='int64'),
('SH', 2): Int64Index([10], dtype='int64'),
('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
('SH', 4): Int64Index([6, 11], dtype='int64'),
('SH', 5): Int64Index([12, 13], dtype='int64'),
('SZ', 1): Int64Index([18], dtype='int64'),
('SZ', 4): Int64Index([19], dtype='int64')}
# Get a single group by its (city, wind) key
g_new.get_group(('BJ',2))
Out[18]:
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
# Print each group; the key is now a (city, wind) tuple
for (name_1, name_2), group in g_new:
    print(name_1, name_2)
    print(group)
BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4
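The two-column grouping aggregates the same way; the result carries a (city, wind) MultiIndex, and reset_index() turns it back into ordinary columns. A brief sketch:
# Mean temperature per (city, wind) combination
g_new['temperature'].mean()
# Flatten the MultiIndex back into columns
g_new['temperature'].mean().reset_index()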
Pivot Tables
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# Open the Excel file
xlsx = pd.ExcelFile('sales-funnel.xlsx')
df = pd.read_excel(xlsx)
# Build a pivot table
# aggfunc defaults to the mean
pd.pivot_table(df,index=['Name'])
Out[13]:
Account Price Quantity
Name
Barton LLC 740150.0 35000.0 1.000000
Fritsch, Russel and Anderson 737550.0 35000.0 1.000000
Herman LLC 141962.0 65000.0 2.000000
Jerde-Hilpert 412290.0 5000.0 2.000000
Kassulke, Ondricka and Metz 307599.0 7000.0 3.000000
Keeling LLC 688981.0 100000.0 5.000000
Kiehn-Spinka 146832.0 65000.0 2.000000
Koepp Ltd 729833.0 35000.0 2.000000
Kulas Inc 218895.0 25000.0 1.500000
Purdy-Kunde 163416.0 30000.0 1.000000
Stokes LLC 239344.0 7500.0 1.000000
Trantow-Barrows 714466.0 15000.0 1.333333
# Multiple index columns produce a multi-level index
pd.pivot_table(df, index=['Name','Rep','Manager'])
Out[15]:
Account Price \
Name Rep Manager
Barton LLC John Smith Debra Henley 740150.0 35000.0
Fritsch, Russel and Anderson Craig Booker Debra Henley 737550.0 35000.0
Herman LLC Cedric Moss Fred Anderson 141962.0 65000.0
Jerde-Hilpert John Smith Debra Henley 412290.0 5000.0
Kassulke, Ondricka and Metz Wendy Yule Fred Anderson 307599.0 7000.0
Keeling LLC Wendy Yule Fred Anderson 688981.0 100000.0
Kiehn-Spinka Daniel Hilton Debra Henley 146832.0 65000.0
Koepp Ltd Wendy Yule Fred Anderson 729833.0 35000.0
Kulas Inc Daniel Hilton Debra Henley 218895.0 25000.0
Purdy-Kunde Cedric Moss Fred Anderson 163416.0 30000.0
Stokes LLC Cedric Moss Fred Anderson 239344.0 7500.0
Trantow-Barrows Craig Booker Debra Henley 714466.0 15000.0
Quantity
Name Rep Manager
Barton LLC John Smith Debra Henley 1.000000
Fritsch, Russel and Anderson Craig Booker Debra Henley 1.000000
Herman LLC Cedric Moss Fred Anderson 2.000000
Jerde-Hilpert John Smith Debra Henley 2.000000
Kassulke, Ondricka and Metz Wendy Yule Fred Anderson 3.000000
Keeling LLC Wendy Yule Fred Anderson 5.000000
Kiehn-Spinka Daniel Hilton Debra Henley 2.000000
Koepp Ltd Wendy Yule Fred Anderson 2.000000
Kulas Inc Daniel Hilton Debra Henley 1.500000
Purdy-Kunde Cedric Moss Fred Anderson 1.000000
Stokes LLC Cedric Moss Fred Anderson 1.000000
Trantow-Barrows Craig Booker Debra Henley 1.333333
# Multi-level index: manager, then sales rep
pd.pivot_table(df, index=['Manager','Rep'])
Out[17]:
Account Price Quantity
Manager Rep
Debra Henley Craig Booker 720237.0 20000.000000 1.250000
Daniel Hilton 194874.0 38333.333333 1.666667
John Smith 576220.0 20000.000000 1.500000
Fred Anderson Cedric Moss 196016.5 27500.000000 1.250000
Wendy Yule 614061.5 44250.000000 3.000000
# You can specify which values column to aggregate
pd.pivot_table(df, index=['Manager','Rep'], values=['Price'])
Out[18]:
Price
Manager Rep
Debra Henley Craig Booker 20000.000000
Daniel Hilton 38333.333333
John Smith 20000.000000
Fred Anderson Cedric Moss 27500.000000
Wendy Yule 44250.000000
# Multiple values columns, with a different aggregation function
pd.pivot_table(df, index=['Manager','Rep'], values=['Price','Quantity'], aggfunc='sum')
Out[19]:
Price Quantity
Manager Rep
Debra Henley Craig Booker 80000 5
Daniel Hilton 115000 5
John Smith 40000 3
Fred Anderson Cedric Moss 110000 5
Wendy Yule 177000 12
# You can add columns, producing multi-level columns
pd.pivot_table(df, index=['Manager','Rep'], values=['Price','Quantity'], columns=['Product'],aggfunc='sum')
Out[20]:
Price Quantity \
Product CPU Maintenance Monitor Software CPU
Manager Rep
Debra Henley Craig Booker 65000.0 5000.0 NaN 10000.0 2.0
Daniel Hilton 105000.0 NaN NaN 10000.0 4.0
John Smith 35000.0 5000.0 NaN NaN 1.0
Fred Anderson Cedric Moss 95000.0 5000.0 NaN 10000.0 3.0
Wendy Yule 165000.0 7000.0 5000.0 NaN 7.0
Product Maintenance Monitor Software
Manager Rep
Debra Henley Craig Booker 2.0 NaN 1.0
Daniel Hilton NaN NaN 1.0
John Smith 2.0 NaN NaN
Fred Anderson Cedric Moss 1.0 NaN 1.0
Wendy Yule 3.0 2.0 NaN
# Fill missing cells with a value
df_pivot = pd.pivot_table(df, index=['Manager','Rep'], values=['Price','Quantity'], columns=['Product'],fill_value=0,aggfunc='sum')
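pivot_table can also append row and column totals via margins=True (pandas labels them 'All'). A sketch extending the call above:
# Same pivot, plus an 'All' row and column of totals
pd.pivot_table(df, index=['Manager','Rep'], values=['Price','Quantity'],
               columns=['Product'], fill_value=0, aggfunc='sum', margins=True)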
Grouping and Pivoting in Practice
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# Load the US airline flight data
f = open('usa_flights.csv')
df = pd.read_csv(f)
df.head()
Out[6]:
flight_date unique_carrier flight_num origin dest arr_delay \
0 02/01/2015 0:00 AA 1 JFK LAX -19.0
1 03/01/2015 0:00 AA 1 JFK LAX -39.0
2 04/01/2015 0:00 AA 1 JFK LAX -12.0
3 05/01/2015 0:00 AA 1 JFK LAX -8.0
4 06/01/2015 0:00 AA 1 JFK LAX 25.0
cancelled distance carrier_delay weather_delay late_aircraft_delay \
0 0 2475 NaN NaN NaN
1 0 2475 NaN NaN NaN
2 0 2475 NaN NaN NaN
3 0 2475 NaN NaN NaN
4 0 2475 0.0 0.0 0.0
nas_delay security_delay actual_elapsed_time
0 NaN NaN 381.0
1 NaN NaN 358.0
2 NaN NaN 385.0
3 NaN NaN 389.0
4 25.0 0.0 424.0
# Sort by arrival delay
df.sort_values('arr_delay', ascending=False).head()
Out[7]:
flight_date unique_carrier flight_num origin dest arr_delay \
11073 11/01/2015 0:00 AA 1595 AUS DFW 1444.0
10214 13/01/2015 0:00 AA 1487 OMA DFW 1392.0
12430 03/01/2015 0:00 AA 1677 MEM DFW 1384.0
8443 04/01/2015 0:00 AA 1279 OMA DFW 1237.0
10328 05/01/2015 0:00 AA 1495 EGE DFW 1187.0
cancelled distance carrier_delay weather_delay late_aircraft_delay \
11073 0 190 1444.0 0.0 0.0
10214 0 583 1392.0 0.0 0.0
12430 0 432 1380.0 0.0 0.0
8443 0 583 1222.0 0.0 15.0
10328 0 721 1019.0 0.0 168.0
nas_delay security_delay actual_elapsed_time
11073 0.0 0.0 59.0
10214 0.0 0.0 117.0
12430 4.0 0.0 104.0
8443 0.0 0.0 102.0
10328 0.0 0.0 127.0
# Top 10 longest arrival delays
df.sort_values('arr_delay', ascending=False)[:10]
# Goal: compute the proportion of delayed vs. on-time flights
# Use value_counts() for the tallies; first, the cancelled flag
df['cancelled'].value_counts()
Out[9]:
0 196873
1 4791
Name: cancelled, dtype: int64
# Add a boolean delayed column
df['delayed'] = df['arr_delay'].apply(lambda x : x>0)
# Count delayed vs. on-time flights
df['delayed'].value_counts()
Out[12]:
False 103037
True 98627
Name: delayed, dtype: int64
# Compute the delay ratio
delay_date = df['delayed'].value_counts()
delay_date
Out[16]:
False 103037
True 98627
Name: delayed, dtype: int64
delay_date[1]/(delay_date[0]+delay_date[1])
Out[18]: 0.48906597112027927
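The same ratio can be read straight from value_counts by passing normalize=True, which returns proportions instead of counts. A one-line sketch:
# Proportion of delayed vs. on-time flights
df['delayed'].value_counts(normalize=True)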
# Compute the delay ratio for each airline
# Group by carrier and the delayed flag
delay_group = df.groupby(['unique_carrier', 'delayed'])
# size() counts the rows per group, giving a Series with a multi-level index
delay_group.size()
Out[23]:
unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64
# unstack() turns the multi-level Series into a DataFrame
df_delay = delay_group.size().unstack()
df_delay
Out[25]:
delayed False True
unique_carrier
AA 8912 9841
AS 3527 2104
B6 4832 4401
DL 17719 9803
EV 10596 11371
F9 1103 1848
HA 1351 1354
MQ 4692 8060
NK 1550 2133
OO 9977 10804
UA 7885 8624
US 7850 6353
VX 1254 781
WN 21789 21150
# Plotting
# Line chart
df_delay.plot()
Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfde9b6198>
# Stacked horizontal bar chart
df_delay.plot(kind='barh', stacked=True, figsize=[16,6], colormap='winter')
Out[27]: <matplotlib.axes._subplots.AxesSubplot at 0x1cfe483b048>
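With the counts unstacked into False/True columns, the per-carrier delay ratio is one more column away. A minimal sketch (the delay_rate column name is just for illustration):
# Columns of df_delay are the booleans from the delayed flag
df_delay['delay_rate'] = df_delay[True] / (df_delay[True] + df_delay[False])
df_delay.sort_values('delay_rate', ascending=False).head()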
# Pivot table
# Pivot by flight date and carrier
flights_by_carrier = df.pivot_table(index='flight_date', columns='unique_carrier', aggfunc='count')
flights_by_carrier.head()
Out[33]:
actual_elapsed_time \
unique_carrier AA AS B6 DL EV F9 HA MQ NK
flight_date
02/01/2015 0:00 1508 477 758 2261 1763 252 223 975 285
03/01/2015 0:00 1425 444 707 2021 1668 180 202 834 282
04/01/2015 0:00 1493 458 752 2254 1709 241 206 845 283
05/01/2015 0:00 1492 433 750 2211 1751 248 207 856 282
06/01/2015 0:00 1370 414 682 2050 1616 246 202 859 275
weather_delay \
unique_carrier OO ... EV F9 HA MQ NK OO UA US
flight_date ...
02/01/2015 0:00 1718 ... 425 90 119 441 94 506 415 175
03/01/2015 0:00 1588 ... 909 110 111 542 152 720 648 339
04/01/2015 0:00 1653 ... 902 171 25 638 166 729 659 480
05/01/2015 0:00 1644 ... 743 138 28 594 163 627 443 291
06/01/2015 0:00 1469 ... 597 90 12 489 149 426 499 424
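Counting every column makes this pivot very wide; restricting values to a single column gives one flight count per carrier per day. A small sketch (flight_num is simply a convenient non-null column to count):
# One count per carrier per day
flights_per_day = df.pivot_table(index='flight_date', columns='unique_carrier',
                                 values='flight_num', aggfunc='count')
flights_per_day.head()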