其他
软件应用丨Pandas玩转数据进阶:(二)
版权声明:本文为CSDN博主「向前走别回头」的原创文章合辑,遵循 CC 4.0 BY-SA 版权协议,特此附上原文出处链接及本声明。
原文链接:
https://blog.csdn.net/weixin_39778570/article/details/81114177
https://blog.csdn.net/weixin_39778570/article/details/81114746
https://blog.csdn.net/weixin_39778570/article/details/81115523
https://blog.csdn.net/weixin_39778570/article/details/81116510
点此回顾 软件应用丨Pandas玩转数据进阶:(一)
通过apply对数据进行处理
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
# 打开一个csv文件,把其中的data列拆分成多列
# Read the CSV inside a context manager so the file handle is closed
# as soon as the DataFrame has been loaded.
with open('apply_demo.csv') as f:
    df = pd.read_csv(f)
df.head()
Out[8]:
time data
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649
# 简单的apply应用
# Build a constant column with one 'a' per row of df, rather than a
# hard-coded row count that only matches one particular input file.
s1 = Series(['a'] * len(df))
df['A'] = s1
df.head()
Out[12]:
time data A
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 a
# apply传入一个函数,对某一列进行处理
df['A'] = df['A'].apply(str.upper)
df.head()
Out[14]:
time data A
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 A
# 通过分析,我们需要的是如下数据
df['data'][0]
Out[15]: ' Symbol: APPL Seqno: 0 Price: 1623'
df['data'][0].strip().split(' ')
Out[16]: ['Symbol:', 'APPL', 'Seqno:', '0', 'Price:', '1623']
l1 = df['data'][0].strip().split(' ')
l1[1], l1[3], l1[5]
Out[18]: ('APPL', '0', '1623')
# 定义一个处理函数
def foo(line):
    """Extract the Symbol, Seqno and Price values from one raw ``data`` string.

    ``line`` is expected to look like ``' Symbol: APPL Seqno: 0 Price: 1623'``:
    alternating label/value tokens separated by whitespace.

    Returns a Series holding the three values (as strings) at positions 0, 1
    and 2. Raises IndexError if the line has fewer than six tokens.
    """
    # split() with no argument trims the ends and collapses runs of
    # whitespace, so repeated spaces between fields cannot produce empty
    # tokens that would shift the value positions.
    items = line.split()
    return Series([items[1], items[3], items[5]])
# 使用apply处理data列
df_temp = df['data'].apply(foo)
df_temp.head()
Out[21]:
0 1 2
0 APPL 0 1623
1 APPL 0 1623
2 APPL 0 1623
3 APPL 0 1623
4 APPL 1 1649
# 修改列名
df_temp = df_temp.rename(columns = {0:'Symbol', 1:'Seqno', 2:'Price'})
df_temp.head()
Out[28]:
Symbol Seqno Price
0 APPL 0 1623
1 APPL 0 1623
2 APPL 0 1623
3 APPL 0 1623
4 APPL 1 1649
# 添加到原df中
df_new = df.combine_first(df_temp)
df_new.head()
Out[30]:
A Price Seqno Symbol data time
0 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962
1 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962
2 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963
3 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963
4 A 1649.0 1.0 APPL Symbol: APPL Seqno: 1 Price: 1649 1473411963
# 删除掉无关列并生成csv
del df_new['data'], df_new[ 'A']
df_new.head()
Out[33]:
Price Seqno Symbol time
0 1623.0 0.0 APPL 1473411962
1 1623.0 0.0 APPL 1473411962
2 1623.0 0.0 APPL 1473411963
3 1623.0 0.0 APPL 1473411963
4 1649.0 1.0 APPL 1473411963
df_new.to_csv('demo_duplicate.csv')
左右滑动查看更多
Series和DataFrame去重
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df = pd.read_csv('demo_duplicate.csv')
df.head()
Out[40]:
Unnamed: 0 Price Seqno Symbol time
0 0 1623.0 0.0 APPL 1473411962
1 1 1623.0 0.0 APPL 1473411962
2 2 1623.0 0.0 APPL 1473411963
3 3 1623.0 0.0 APPL 1473411963
4 4 1649.0 1.0 APPL 1473411963
#
df.size
Out[41]: 19945
len(df)
Out[42]: 3989
len(df['Seqno'].unique())
Out[46]: 1000
# 某一列(Series)的duplicated
df['Seqno'].duplicated().head()
Out[47]:
0 False
1 True
2 True
3 True
4 False
Name: Seqno, dtype: bool
type(df['Seqno'].duplicated())
Out[48]: pandas.core.series.Series
# DataFrame去重,drop_duplicates
df.drop_duplicates(['Seqno']).head()
Out[49]:
Unnamed: 0 Price Seqno Symbol time
0 0 1623.0 0.0 APPL 1473411962
4 4 1649.0 1.0 APPL 1473411963
8 8 1642.0 2.0 APPL 1473411964
12 12 1636.0 3.0 APPL 1473411965
16 16 1669.0 4.0 APPL 1473411966
# keep参数指定保留哪一个
df.drop_duplicates(['Seqno'], keep='last').head()
Out[53]:
Unnamed: 0 Price Seqno Symbol time
3 3 1623.0 0.0 APPL 1473411963
7 7 1649.0 1.0 APPL 1473411964
11 11 1642.0 2.0 APPL 1473411965
15 15 1636.0 3.0 APPL 1473411966
19 19 1669.0 4.0 APPL 1473411967
左右滑动查看更多
时间序列简单操作
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from datetime import datetime
# 生成一个时间
t1 = datetime(2009, 10, 20)
t1
Out[7]: datetime.datetime(2009, 10, 20, 0, 0)
# 手写一个时间列表
date_list = [datetime(2016,9,1), datetime(2016,9,10), datetime(2017,9,1),datetime(2017,9,20),datetime(2017,10,1)]
date_list
Out[12]:
[datetime.datetime(2016, 9, 1, 0, 0),
datetime.datetime(2016, 9, 10, 0, 0),
datetime.datetime(2017, 9, 1, 0, 0),
datetime.datetime(2017, 9, 20, 0, 0),
datetime.datetime(2017, 10, 1, 0, 0)]
# Series的index改为date_list
s1 = Series(np.random.rand(5), index=date_list)
s1
Out[14]:
2016-09-01 0.091639
2016-09-10 0.823207
2017-09-01 0.082705
2017-09-20 0.162824
2017-10-01 0.379725
dtype: float64
s1.values
Out[15]: array([ 0.09163872, 0.82320691, 0.08270518, 0.16282401, 0.37972535])
# index 是一个DatetimeIndex
s1.index
Out[16]:
DatetimeIndex(['2016-09-01', '2016-09-10', '2017-09-01', '2017-09-20',
'2017-10-01'],
dtype='datetime64[ns]', freq=None)
# 对Series访问
s1[datetime(2016,9,10)]
Out[18]: 0.82320691034095983
s1['2016-9-10']
Out[19]: 0.82320691034095983
s1['2016/9/10']
Out[20]: 0.82320691034095983
s1['20160910']
Out[21]: 0.82320691034095983
# 不可以直接'201609',报错
s1['2016-09']
Out[22]:
2016-09-01 0.091639
2016-09-10 0.823207
dtype: float64
s1['2016']
Out[23]:
2016-09-01 0.091639
2016-09-10 0.823207
dtype: float64
# pandas生成Datetime,一个开始参数,一个结束参数,periods表示数量,freq表示间隔
# 生成100个,freq默认为'D'天数
date_list_new = pd.date_range('2016-01-01', periods=100)
date_list_new
Out[29]:
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
'2016-01-09', '2016-01-10', '2016-01-11', '2016-01-12',
'2016-01-13', '2016-01-14', '2016-01-15', '2016-01-16',
'2016-01-17', '2016-01-18', '2016-01-19', '2016-01-20',
'2016-01-21', '2016-01-22', '2016-01-23', '2016-01-24',
'2016-01-25', '2016-01-26', '2016-01-27', '2016-01-28',
'2016-01-29', '2016-01-30', '2016-01-31', '2016-02-01',
'2016-02-02', '2016-02-03', '2016-02-04', '2016-02-05',
'2016-02-06', '2016-02-07', '2016-02-08', '2016-02-09',
'2016-02-10', '2016-02-11', '2016-02-12', '2016-02-13',
'2016-02-14', '2016-02-15', '2016-02-16', '2016-02-17',
'2016-02-18', '2016-02-19', '2016-02-20', '2016-02-21',
'2016-02-22', '2016-02-23', '2016-02-24', '2016-02-25',
'2016-02-26', '2016-02-27', '2016-02-28', '2016-02-29',
'2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
'2016-03-05', '2016-03-06', '2016-03-07', '2016-03-08',
'2016-03-09', '2016-03-10', '2016-03-11', '2016-03-12',
'2016-03-13', '2016-03-14', '2016-03-15', '2016-03-16',
'2016-03-17', '2016-03-18', '2016-03-19', '2016-03-20',
'2016-03-21', '2016-03-22', '2016-03-23', '2016-03-24',
'2016-03-25', '2016-03-26', '2016-03-27', '2016-03-28',
'2016-03-29', '2016-03-30', '2016-03-31', '2016-04-01',
'2016-04-02', '2016-04-03', '2016-04-04', '2016-04-05',
'2016-04-06', '2016-04-07', '2016-04-08', '2016-04-09'],
dtype='datetime64[ns]', freq='D')
# 修改freq,每周一
date_list_new = pd.date_range('2016-01-01', periods=100, freq='W-MON')
date_list_new
Out[31]:
DatetimeIndex(['2016-01-04', '2016-01-11', '2016-01-18', '2016-01-25',
'2016-02-01', '2016-02-08', '2016-02-15', '2016-02-22',
'2016-02-29', '2016-03-07', '2016-03-14', '2016-03-21',
'2016-03-28', '2016-04-04', '2016-04-11', '2016-04-18',
'2016-04-25', '2016-05-02', '2016-05-09', '2016-05-16',
'2016-05-23', '2016-05-30', '2016-06-06', '2016-06-13',
'2016-06-20', '2016-06-27', '2016-07-04', '2016-07-11',
'2016-07-18', '2016-07-25', '2016-08-01', '2016-08-08',
'2016-08-15', '2016-08-22', '2016-08-29', '2016-09-05',
'2016-09-12', '2016-09-19', '2016-09-26', '2016-10-03',
'2016-10-10', '2016-10-17', '2016-10-24', '2016-10-31',
'2016-11-07', '2016-11-14', '2016-11-21', '2016-11-28',
'2016-12-05', '2016-12-12', '2016-12-19', '2016-12-26',
'2017-01-02', '2017-01-09', '2017-01-16', '2017-01-23',
'2017-01-30', '2017-02-06', '2017-02-13', '2017-02-20',
'2017-02-27', '2017-03-06', '2017-03-13', '2017-03-20',
'2017-03-27', '2017-04-03', '2017-04-10', '2017-04-17',
'2017-04-24', '2017-05-01', '2017-05-08', '2017-05-15',
'2017-05-22', '2017-05-29', '2017-06-05', '2017-06-12',
'2017-06-19', '2017-06-26', '2017-07-03', '2017-07-10',
'2017-07-17', '2017-07-24', '2017-07-31', '2017-08-07',
'2017-08-14', '2017-08-21', '2017-08-28', '2017-09-04',
'2017-09-11', '2017-09-18', '2017-09-25', '2017-10-02',
'2017-10-09', '2017-10-16', '2017-10-23', '2017-10-30',
'2017-11-06', '2017-11-13', '2017-11-20', '2017-11-27'],
dtype='datetime64[ns]', freq='W-MON')
# 间隔5小时
date_list_new = pd.date_range('2016-01-01','2016-02-01', freq='5H')
date_list_new
Out[36]:
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 05:00:00',
'2016-01-01 10:00:00', '2016-01-01 15:00:00',
'2016-01-01 20:00:00', '2016-01-02 01:00:00',
'2016-01-02 06:00:00', '2016-01-02 11:00:00',
'2016-01-02 16:00:00', '2016-01-02 21:00:00',
...
'2016-01-29 23:00:00', '2016-01-30 04:00:00',
'2016-01-30 09:00:00', '2016-01-30 14:00:00',
'2016-01-30 19:00:00', '2016-01-31 00:00:00',
'2016-01-31 05:00:00', '2016-01-31 10:00:00',
'2016-01-31 15:00:00', '2016-01-31 20:00:00'],
dtype='datetime64[ns]', length=149, freq='5H')
# Series
# Size the random data from the index itself: a fixed 100 values does not
# match the 149-entry 5-hourly range built above and raises a ValueError.
s2 = Series(np.random.rand(len(date_list_new)), index=date_list_new)
s2.head()
Out[40]:
2016-01-01 00:00:00 0.298580
2016-01-01 05:00:00 0.145782
2016-01-01 10:00:00 0.356871
2016-01-01 15:00:00 0.607690
2016-01-01 20:00:00 0.223771
Freq: 5H, dtype: float64
左右滑动查看更多
时间序列的采样和画图
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# 生成一个时间序列
t_range = pd.date_range('2016-01-01', '2016-12-31')
t_range
Out[5]:
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
'2016-01-09', '2016-01-10',
...
'2016-12-22', '2016-12-23', '2016-12-24', '2016-12-25',
'2016-12-26', '2016-12-27', '2016-12-28', '2016-12-29',
'2016-12-30', '2016-12-31'],
dtype='datetime64[ns]', length=366, freq='D')
# 创建Series
s1 = Series(np.random.randn(len(t_range)), index=t_range)
s1
Out[9]:
2016-01-01 -1.408484
2016-01-02 -0.530784
2016-01-03 0.659089
2016-01-04 1.468151
2016-01-05 0.678399
2016-01-06 -2.446700
2016-01-07 -0.403404
2016-01-08 0.623137
2016-01-09 1.068036
2016-01-10 1.406340
2016-01-11 -0.925950
2016-01-12 0.886542
2016-01-13 0.527620
2016-01-14 0.177987
2016-01-15 -0.843907
2016-01-16 1.271302
2016-01-17 1.646341
2016-01-18 -0.420305
2016-01-19 -1.552205
2016-01-20 -0.884822
2016-01-21 0.660273
2016-01-22 0.945790
2016-01-23 1.698283
2016-01-24 0.668180
2016-01-25 1.470522
2016-01-26 0.687848
2016-01-27 0.033351
2016-01-28 -0.844644
2016-01-29 0.472518
2016-01-30 -0.920086
2016-12-02 0.313375
2016-12-03 0.458618
2016-12-04 0.197696
2016-12-05 1.238550
2016-12-06 2.249532
2016-12-07 1.095712
2016-12-08 0.693674
2016-12-09 -0.377020
2016-12-10 0.532677
2016-12-11 1.714745
2016-12-12 0.124774
2016-12-13 -0.372079
2016-12-14 -0.932541
2016-12-15 -0.320267
2016-12-16 -0.719403
2016-12-17 -2.012314
2016-12-18 -0.510938
2016-12-19 -0.354006
2016-12-20 -0.351626
2016-12-21 -0.653467
2016-12-22 0.169920
2016-12-23 0.588163
2016-12-24 -0.692558
2016-12-25 -0.078781
2016-12-26 -0.157261
2016-12-27 -1.480809
2016-12-28 1.586904
2016-12-29 -0.791816
2016-12-30 0.951799
2016-12-31 1.283303
Freq: D, Length: 366, dtype: float64
# 一月份数据
s1['2016-01']
Out[10]:
2016-01-01 -1.408484
2016-01-02 -0.530784
2016-01-03 0.659089
2016-01-04 1.468151
2016-01-05 0.678399
2016-01-06 -2.446700
2016-01-07 -0.403404
2016-01-08 0.623137
2016-01-09 1.068036
2016-01-10 1.406340
2016-01-11 -0.925950
2016-01-12 0.886542
2016-01-13 0.527620
2016-01-14 0.177987
2016-01-15 -0.843907
2016-01-16 1.271302
2016-01-17 1.646341
2016-01-18 -0.420305
2016-01-19 -1.552205
2016-01-20 -0.884822
2016-01-21 0.660273
2016-01-22 0.945790
2016-01-23 1.698283
2016-01-24 0.668180
2016-01-25 1.470522
2016-01-26 0.687848
2016-01-27 0.033351
2016-01-28 -0.844644
2016-01-29 0.472518
2016-01-30 -0.920086
2016-01-31 -1.070854
Freq: D, dtype: float64
# 一月份取平均值
s1['2016-01'].mean()
Out[11]: 0.15476017406190043
# 对月份取平均值
s1_month = s1.resample('M').mean()
s1_month
Out[13]:
2016-01-31 0.154760
2016-02-29 -0.171572
2016-03-31 -0.127375
2016-04-30 -0.298081
2016-05-31 0.332433
2016-06-30 -0.317580
2016-07-31 -0.239776
2016-08-31 -0.020108
2016-09-30 -0.276503
2016-10-31 -0.121938
2016-11-30 -0.025510
2016-12-31 0.088271
Freq: M, dtype: float64
# 前向填充ffill:例如1月1日各小时缺失的数据,用1月1日0点的值向后填充
s1.resample('H').ffill()
Out[14]:
2016-01-01 00:00:00 -1.408484
2016-01-01 01:00:00 -1.408484
2016-01-01 02:00:00 -1.408484
2016-01-01 03:00:00 -1.408484
2016-01-01 04:00:00 -1.408484
2016-01-01 05:00:00 -1.408484
2016-01-01 06:00:00 -1.408484
2016-01-01 07:00:00 -1.408484
2016-01-01 08:00:00 -1.408484
2016-01-01 09:00:00 -1.408484
2016-01-01 10:00:00 -1.408484
2016-01-01 11:00:00 -1.408484
2016-01-01 12:00:00 -1.408484
2016-01-01 13:00:00 -1.408484
2016-01-01 14:00:00 -1.408484
2016-01-01 15:00:00 -1.408484
2016-01-01 16:00:00 -1.408484
2016-01-01 17:00:00 -1.408484
2016-01-01 18:00:00 -1.408484
2016-01-01 19:00:00 -1.408484
2016-01-01 20:00:00 -1.408484
2016-01-01 21:00:00 -1.408484
2016-01-01 22:00:00 -1.408484
2016-01-01 23:00:00 -1.408484
2016-01-02 00:00:00 -0.530784
2016-01-02 01:00:00 -0.530784
2016-01-02 02:00:00 -0.530784
2016-01-02 03:00:00 -0.530784
2016-01-02 04:00:00 -0.530784
2016-01-02 05:00:00 -0.530784
2016-12-29 19:00:00 -0.791816
2016-12-29 20:00:00 -0.791816
2016-12-29 21:00:00 -0.791816
2016-12-29 22:00:00 -0.791816
2016-12-29 23:00:00 -0.791816
2016-12-30 00:00:00 0.951799
2016-12-30 01:00:00 0.951799
2016-12-30 02:00:00 0.951799
2016-12-30 03:00:00 0.951799
2016-12-30 04:00:00 0.951799
2016-12-30 05:00:00 0.951799
2016-12-30 06:00:00 0.951799
2016-12-30 07:00:00 0.951799
2016-12-30 08:00:00 0.951799
2016-12-30 09:00:00 0.951799
2016-12-30 10:00:00 0.951799
2016-12-30 11:00:00 0.951799
2016-12-30 12:00:00 0.951799
2016-12-30 13:00:00 0.951799
2016-12-30 14:00:00 0.951799
2016-12-30 15:00:00 0.951799
2016-12-30 16:00:00 0.951799
2016-12-30 17:00:00 0.951799
2016-12-30 18:00:00 0.951799
2016-12-30 19:00:00 0.951799
2016-12-30 20:00:00 0.951799
2016-12-30 21:00:00 0.951799
2016-12-30 22:00:00 0.951799
2016-12-30 23:00:00 0.951799
2016-12-31 00:00:00 1.283303
Freq: H, Length: 8761, dtype: float64
# 后向填充bfill:例如1月1日各小时缺失的数据,用1月2日0点的值向前填充
s1.resample('H').bfill()
Out[15]:
2016-01-01 00:00:00 -1.408484
2016-01-01 01:00:00 -0.530784
2016-01-01 02:00:00 -0.530784
2016-01-01 03:00:00 -0.530784
2016-01-01 04:00:00 -0.530784
2016-01-01 05:00:00 -0.530784
2016-01-01 06:00:00 -0.530784
2016-01-01 07:00:00 -0.530784
2016-01-01 08:00:00 -0.530784
2016-01-01 09:00:00 -0.530784
2016-01-01 10:00:00 -0.530784
2016-01-01 11:00:00 -0.530784
2016-01-01 12:00:00 -0.530784
2016-01-01 13:00:00 -0.530784
2016-01-01 14:00:00 -0.530784
2016-01-01 15:00:00 -0.530784
2016-01-01 16:00:00 -0.530784
2016-01-01 17:00:00 -0.530784
2016-01-01 18:00:00 -0.530784
2016-01-01 19:00:00 -0.530784
2016-01-01 20:00:00 -0.530784
2016-01-01 21:00:00 -0.530784
2016-01-01 22:00:00 -0.530784
2016-01-01 23:00:00 -0.530784
2016-01-02 00:00:00 -0.530784
2016-01-02 01:00:00 0.659089
2016-01-02 02:00:00 0.659089
2016-01-02 03:00:00 0.659089
2016-01-02 04:00:00 0.659089
2016-01-02 05:00:00 0.659089
2016-12-29 19:00:00 0.951799
2016-12-29 20:00:00 0.951799
2016-12-29 21:00:00 0.951799
2016-12-29 22:00:00 0.951799
2016-12-29 23:00:00 0.951799
2016-12-30 00:00:00 0.951799
2016-12-30 01:00:00 1.283303
2016-12-30 02:00:00 1.283303
2016-12-30 03:00:00 1.283303
2016-12-30 04:00:00 1.283303
2016-12-30 05:00:00 1.283303
2016-12-30 06:00:00 1.283303
2016-12-30 07:00:00 1.283303
2016-12-30 08:00:00 1.283303
2016-12-30 09:00:00 1.283303
2016-12-30 10:00:00 1.283303
2016-12-30 11:00:00 1.283303
2016-12-30 12:00:00 1.283303
2016-12-30 13:00:00 1.283303
2016-12-30 14:00:00 1.283303
2016-12-30 15:00:00 1.283303
2016-12-30 16:00:00 1.283303
2016-12-30 17:00:00 1.283303
2016-12-30 18:00:00 1.283303
2016-12-30 19:00:00 1.283303
2016-12-30 20:00:00 1.283303
2016-12-30 21:00:00 1.283303
2016-12-30 22:00:00 1.283303
2016-12-30 23:00:00 1.283303
2016-12-31 00:00:00 1.283303
Freq: H, Length: 8761, dtype: float64
# 画图
# 新建一个时间序列
t_range = pd.date_range('2016-01-01', '2016-12-31', freq='H')
t_range
Out[17]:
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
'2016-01-01 02:00:00', '2016-01-01 03:00:00',
'2016-01-01 04:00:00', '2016-01-01 05:00:00',
'2016-01-01 06:00:00', '2016-01-01 07:00:00',
'2016-01-01 08:00:00', '2016-01-01 09:00:00',
...
'2016-12-30 15:00:00', '2016-12-30 16:00:00',
'2016-12-30 17:00:00', '2016-12-30 18:00:00',
'2016-12-30 19:00:00', '2016-12-30 20:00:00',
'2016-12-30 21:00:00', '2016-12-30 22:00:00',
'2016-12-30 23:00:00', '2016-12-31 00:00:00'],
dtype='datetime64[ns]', length=8761, freq='H')
# 建立一个DataFrame
stock_df = DataFrame(index=t_range)
# 加入两列,模拟股票
stock_df['BABA'] = np.random.randint(80, 100, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30, 50, size=len(t_range))
# 显示图片,下面
stock_df.plot()
Out[22]: <matplotlib.axes._subplots.AxesSubplot at 0x2259b52fa90>
# 数据过于密集,重新清洗,按周清洗
weekly_df = DataFrame()
# resample参数'W'表示按周重采样,再取平均值
weekly_df['BABA'] = stock_df['BABA'].resample('W').mean()
weekly_df['TENCENT'] = stock_df['TENCENT'].resample('W').mean()
weekly_df.head()
Out[31]:
BABA TENCENT
2016-01-03 89.250000 39.430556
2016-01-10 89.065476 38.595238
2016-01-17 89.363095 39.410714
2016-01-24 89.482143 38.857143
2016-01-31 89.869048 40.309524
# 显示
weekly_df.plot()
Out[32]: <matplotlib.axes._subplots.AxesSubplot at 0x2259bf84e48>
左右滑动查看更多
·END·
点击阅读原文,进入新型农业经营主体大数据库
软件应用丨Pandas入门系列(三):简单数据处理
软件应用丨Pandas入门系列(二):Pandas io操作
软件应用丨Pandas入门系列(一):深入理解Series和DataFrame
数据Seminar
这里是大数据、分析技术与学术研究的三叉路口
出处:CSDN 作者:向前走别回头 推荐:青酱 排版编辑:青酱
欢迎扫描👇二维码添加关注
点击阅读原文,获得更多精彩内容!