软件应用丨Pandas入门系列(一):深入理解Series和DataFrame
版权声明:本文为CSDN博主「向前走别回头」的原创文章,遵循 CC 4.0 BY-SA 版权协议,在文中已附上链接。
Series的创建
# 使用列表创建
>>> import numpy as np
>>> import pandas as pd
>>> s1 = pd.Series([1,2,3,4])
>>> s1
0 1
1 2
2 3
3 4
dtype: int64
# 查看s1的值和索引
>>> s1.values
array([1, 2, 3, 4], dtype=int64)
>>> s1.index
RangeIndex(start=0, stop=4, step=1) # 默认索引
# 使用数组创建
>>> s2 = pd.Series(np.arange(10))
>>> s2
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
# 使用字典创建
>>> s3 = pd.Series({'1':1, '2':2, '3':3})
>>> s3
1 1
2 2
3 3
dtype: int64
>>> s3.values
array([1, 2, 3], dtype=int64)
>>> s3.index
Index(['1', '2', '3'], dtype='object')
Series的访问
>>> s4 = pd.Series([1,2,3,4], index = ['a','b','c','d'])
>>> s4
a 1
b 2
c 3
d 4
dtype: int64
>>> s4.values
array([1, 2, 3, 4], dtype=int64)
>>> s4.index
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> s4['a'] # 访问索引为a的值
1
>>> s4[s4>2] #访问s4中值大于2的Series
c 3
d 4
dtype: int64
# Series与字典的转换
>>> s4.to_dict() # s4转换为字典
{'a': 1, 'b': 2, 'c': 3, 'd': 4}
>>> s5 = pd.Series(s4.to_dict()) # 字典转换为Series
>>> s5
a 1
b 2
c 3
d 4
dtype: int64
# e索引无值补充为NaN
>>> index_1 = ['a','b','c','d','e']
>>> s6 = pd.Series(s5, index = index_1)
>>> s6
a 1.0
b 2.0
c 3.0
d 4.0
e NaN # s5此处无值
dtype: float64
# NaN判断
>>> pd.isnull(s6)
a False
b False
c False
d False
e True
dtype: bool
>>> pd.notnull(s6)
a True
b True
c True
d True
e False
dtype: bool
# 命名修改
>>> s6.name = 'demo' # s6的名字修改
>>> s6
a 1.0
b 2.0
c 3.0
d 4.0
e NaN
Name: demo, dtype: float64
>>> s6.index.name = 'demo_index' # s6的索引的名字的修改
>>> s6.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object', name='demo_index')
官网:
http://pandas.pydata.org/pandas-docs/version/0.14.1/
原文链接:
DataFrame数据框
# 巧用复制粘贴
>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series, DataFrame
>>> import webbrowser
>>> link = 'http://www.tiobe.com/tiobe-index/'
>>> webbrowser.open(link) # 打开一个网页 然后把要生成的数据复制到剪切板,以下选择了编程语言前10行
True
>>> df = pd.read_clipboard() # 执行这段代码自动生成DataFrame对象
>>> df
May 2018 May 2017 Change Programming Language Ratings Change.1
0 1 1 NaN Java 16.380% +1.74%
1 2 2 NaN C 14.000% +7.00%
2 3 3 NaN C++ 7.668% +2.92%
3 4 4 NaN Python 5.192% +1.64%
4 5 5 NaN C# 4.402% +0.95%
5 6 6 NaN Visual Basic .NET 4.124% +0.73%
6 7 9 change PHP 3.321% +0.63%
7 8 7 change JavaScript 2.923% -0.15%
8 9 - change SQL 1.987% +1.99%
9 10 11 change Ruby 1.182% -1.25%
>>> type(df) # 查看类型
<class 'pandas.core.frame.DataFrame'>
# 查看DataFrame的内容
>>> df.columns # 查看所有列
Index(['May 2018', 'May 2017', 'Change', 'Programming Language', 'Ratings',
'Change.1'],
dtype='object')
>>> df.Ratings # 查看Ratings这一列
0 16.380%
1 14.000%
2 7.668%
3 5.192%
4 4.402%
5 4.124%
6 3.321%
7 2.923%
8 1.987%
9 1.182%
Name: Ratings, dtype: object
# 某一列的类型为Series(序列)
>>> type(df['May 2018'])
<class 'pandas.core.series.Series'>
# 从df中提取指定的列
>>> df_new = DataFrame(df, columns = ['May 2018', 'Change.1']) # 从df中提取两列生成新的DataFrame
>>> df_new
May 2018 Change.1
0 1 +1.74%
1 2 +7.00%
2 3 +2.92%
3 4 +1.64%
4 5 +0.95%
5 6 +0.73%
6 7 +0.63%
7 8 -0.15%
8 9 +1.99%
9 10 -1.25%
# 如果列里面多加了不存在的 会自动赋值为空
>>> df_new = DataFrame(df, columns=['May 2018', 'Change', 'Sep 2019'])
>>> df_new
May 2018 Change Sep 2019 # Sep 2019 这一列是不存在的
0 1 NaN NaN
1 2 NaN NaN
2 3 NaN NaN
3 4 NaN NaN
4 5 NaN NaN
5 6 NaN NaN
6 7 change NaN
7 8 change NaN
8 9 change NaN
9 10 change NaN
# 列的赋值
# 对序列进行赋值,使用range函数
>>> df_new['Sep 2019'] = range(0,10) # 赋值 0-9 这10个数给 Sep 2019 这一列
>>> df_new
May 2018 Change Sep 2019
0 1 NaN 0
1 2 NaN 1
2 3 NaN 2
3 4 NaN 3
4 5 NaN 4
5 6 NaN 5
6 7 change 6
7 8 change 7
8 9 change 8
9 10 change 9
# 使用np下的arange(数组)函数
>>> df_new['Sep 2019'] = np.arange(1,11)
>>> df_new
May 2018 Change Sep 2019
0 1 NaN 1
1 2 NaN 2
2 3 NaN 3
3 4 NaN 4
4 5 NaN 5
5 6 NaN 6
6 7 change 7
7 8 change 8
8 9 change 9
9 10 change 10
# 使用序列修改
>>> df_new['Sep 2019'] = pd.Series(np.arange(2,12))
>>> df_new
May 2018 Change Sep 2019
0 1 NaN 2
1 2 NaN 3
2 3 NaN 4
3 4 NaN 5
4 5 NaN 6
5 6 NaN 7
6 7 change 8
7 8 change 9
8 9 change 10
9 10 change 11
# 对单独某一列下的某些行进行赋值
>>> df_new['Sep 2019'] = pd.Series([100,200], index=[1,2])
>>> df_new
May 2018 Change Sep 2019
0 1 NaN NaN
1 2 NaN 100.0
2 3 NaN 200.0
3 4 NaN NaN
4 5 NaN NaN
5 6 NaN NaN
6 7 change NaN
7 8 change NaN
8 9 change NaN
9 10 change NaN
http://pandas.pydata.org/pandas-docs/version/0.14.1/
DataFrame和Series
# 导入需要的模块
>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series, DataFrame
# 首先创建一个字典
>>> data = {'Student':['XiaoMing','XiaoHong','XiaoWang'],'Grade':[100,90,20],'Class':['RG1','RG2','RG3']}
# 创建一个Series对象
>>> s1 = pd.Series(data['Student'])
>>> s1
0 XiaoMing
1 XiaoHong
2 XiaoWang
dtype: object
# 查看一下这个对象的属性
>>> s1.values
array(['XiaoMing', 'XiaoHong', 'XiaoWang'], dtype=object)
# 当没有给索引赋值的时候默认为
>>> s1.index
RangeIndex(start=0, stop=3, step=1)
# 创建并修改默认索引
>>> s1 = pd.Series(data['Student'], index = ['first','second','three'])
>>> s1
first XiaoMing
second XiaoHong
three XiaoWang
dtype: object
# 给它起个列名‘GDUF’
>>> s1 = pd.Series(data['Student'], index = ['first','second','three'], name = 'GDUF')
>>> s1
first XiaoMing
second XiaoHong
three XiaoWang
Name: GDUF, dtype: object
# 注意:索引和列名是可以修改的,方式如下:
>>> s1.name = 'haha'
>>> s1.index = ['first','second','three']
# 使用字典创建DataFrame对象
>>> df1 = pd.DataFrame(data)
>>> df1
Student Grade Class
0 XiaoMing 100 RG1
1 XiaoHong 90 RG2
2 XiaoWang 20 RG3
# 查看DataFrame的属性
# 某一列
>>> cou = df1['Student']
>>> cou
0 XiaoMing
1 XiaoHong
2 XiaoWang
Name: Student, dtype: object # Series是有名字的
# 列的类型为Series
>>> type(cou)
<class 'pandas.core.series.Series'>
# 默认索引
>>> df1.index
RangeIndex(start=0, stop=3, step=1)
# 查看某一列某一行
>>> c1 = df1['Student'][0]
>>> c1
'XiaoMing'
>>> type(c1)
<class 'str'>
# 查看某一行的属性
>>> for row in df1.iterrows():
...     print(row), print(type(row)),print(type(row[0])), print(type(row[1]))
...     break
(0, Student XiaoMing
Grade 100
Class RG1
Name: 0, dtype: object) # 每一行默认的名字为0
<class 'tuple'> # 每一行为一个tuple对象
<class 'int'> # 默认索引为int类型
<class 'pandas.core.series.Series'> # 值为Series类型
(None, None, None, None)
# 通过Series构建DataFrame对象
>>> data
{'Student': ['XiaoMing', 'XiaoHong', 'XiaoWang'], 'Grade': [100, 90, 20], 'Class': ['RG1', 'RG2', 'RG3']}
>>> s1 = pd.Series(data['Student'])
>>> s2 = pd.Series(data['Grade'])
>>> s3 = pd.Series(data['Class'])
>>> df_new = pd.DataFrame([s1,s2,s3]) # 当以列表的形式构建的时候会按行来放,有时候可以优先选择按行放
>>> df_new
0 1 2
0 XiaoMing XiaoHong XiaoWang
1 100 90 20
2 RG1 RG2 RG3
>>> df_new.T # 进行行列转置
0 1 2
0 XiaoMing 100 RG1
1 XiaoHong 90 RG2
2 XiaoWang 20 RG3
# 修改索引(通过字典直接构建,修改索引)
>>> df_new = DataFrame(data)
>>> df_new
Student Grade Class
0 XiaoMing 100 RG1
1 XiaoHong 90 RG2
2 XiaoWang 20 RG3
>>> df_new.index = ['first', 'second', 'thrid']
>>> df_new
Student Grade Class
first XiaoMing 100 RG1
second XiaoHong 90 RG2
thrid XiaoWang 20 RG3
# 看一下这行代码
# 构建Series对象的时候,同时指定索引,名字
>>> s1 = pd.Series(data['Student'], name = 'Student', index = ['one','two','three'])
# 这样子DataFrame的结构就呼之欲出了,即列(columns),索引(index),值(values)
>>> df2 = pd.DataFrame(s1)
>>> df2
Student
one XiaoMing
two XiaoHong
three XiaoWang
>>> s1 = pd.Series(data['Student'], name = 'Student', index = ['one','two','three'])
>>> s2 = pd.Series(data['Grade'], name = 'Grade', index = ['one','two','three'])
>>> s3 = pd.Series(data['Class'], name = 'Class', index = ['one','two','three'])
>>> df2 = pd.DataFrame([s1,s2,s3])
>>> df2
one two three
Student XiaoMing XiaoHong XiaoWang
Grade 100 90 20
Class RG1 RG2 RG3
# 用列表构建是按行放置的,需要转置一下
>>> df2.T
Student Grade Class
one XiaoMing 100 RG1
two XiaoHong 90 RG2
three XiaoWang 20 RG3
# 下面这种也是一种比较直观的构建方式。但是比较繁琐 (字典中的值为Series)
>>> d = {'one': Series([1., 2., 3.], index=['a', 'b', 'c']), 'two': Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
>>> df = DataFrame(d)
>>> df
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
http://pandas.pydata.org/pandas-docs/version/0.14.1/
·END·
软件应用丨超强干货!Numpy简单使用(入门)
软件应用丨Python数据分析,学习路径拆解及资源推荐(附详细思维导图)
机器学习丨经济学研究中的机器学习:回顾与展望
数据Seminar
这里是大数据、分析技术与学术研究的三叉路口
出处:CSDN 作者:向前走别回头 推荐:青酱 排版编辑:青酱
欢迎扫描👇二维码添加关注