查看原文
其他

软件应用丨Pandas入门系列(一):深入理解Series和DataFrame

向前走别回头 数据Seminar 2021-06-03

版权声明:本文为CSDN博主「向前走别回头」的原创文章,遵循 CC 4.0 BY-SA 版权协议,在文中已附上链接


Series的创建


# 使用列表创建

>>> import numpy as np
>>> import pandas as pd
>>> s1 = pd.Series([1,2,3,4])
>>> s1
0    1
1    2
2    3
3    4
dtype: int64
# 查看s1的值和索引
>>> s1.values
array([1, 2, 3, 4], dtype=int64)
>>> s1.index
RangeIndex(start=0, stop=4, step=1) # 默认索引
左右滑动查看更多

# 使用数组创建

>>> s2 = pd.Series(np.arange(10))>>> s20 01 12 23 34 45 56 67 78 89 9dtype: int32
左右滑动查看更多

# 使用字典创建

>>> s3 = pd.Series({'1':1, '2':2, '3':3})>>> s31 12 23 3dtype: int64>>> s3.valuesarray([1, 2, 3], dtype=int64)>>> s3.indexIndex(['1', '2', '3'], dtype='object')
左右滑动查看更多


Series的访问

>>> s4 = pd.Series([1,2,3,4], index = ['a','b','c','d'])>>> s4a 1b 2c 3d 4dtype: int64>>> s4.valuesarray([1, 2, 3, 4], dtype=int64)>>> s4.indexIndex(['a', 'b', 'c', 'd'], dtype='object')>>> s4['a'] # 访问索引为a的值1>>> s4[s4>2] #访问s4中值大于2的Seriesc 3d 4dtype: int64
左右滑动查看更多

# Series与字典的转换

>>> s4.to_dict() # s4转换为字典{'a': 1, 'b': 2, 'c': 3, 'd': 4}

>>> s5 = pd.Series(s4.to_dict()) # 字典转换为Series>>> s5a 1b 2c 3d 4dtype: int64
左右滑动查看更多

# e索引无值补充为NaN

>>> index_1 = ['a','b','c','d','e']>>> s6 = pd.Series(s5, index = index_1)>>> s6a 1.0b 2.0c 3.0d 4.0e NaN # s5此处无值dtype: float64
左右滑动查看更多

# NaN判断

>>> pd.isnull(s6)a Falseb Falsec Falsed Falsee Truedtype: bool>>> pd.notnull(s6)a Trueb Truec Trued Truee Falsedtype: bool
左右滑动查看更多

# 命名修改

>>> s6.name = 'demo' # s6的名字修改>>> s6a 1.0b 2.0c 3.0d 4.0e NaNName: demo, dtype: float64
>>> s6.index.name = 'demo_index' # s6的索引的名字的修改>>> s6.indexIndex(['a', 'b', 'c', 'd', 'e'], dtype='object', name='demo_index')
左右滑动查看更多

官网:

http://pandas.pydata.org/pandas-docs/version/0.14.1/

原文链接:

https://blog.csdn.net/weixin_39778570/article/details/80550179




DataFrame数据框


# 巧用复制黏贴

>>> import numpy as np>>> import pandas as pd>>> from pandas import Series, DataFrame>>> import webbrowser>>> link = 'http://www.tiobe.com/tiobe-index/'>>> webbrowser.open(link) # 打开一个网页 然后把要生成的数据复制到剪切板,以下选择了编程语言前10行True
>>> df = pd.read_clipboard() # 执行这段代码自动生成DataFrame对象
>>> df
   May 2018 May 2017  Change Programming Language  Ratings Change.1
0         1        1     NaN                 Java  16.380%   +1.74%
1         2        2     NaN                    C  14.000%   +7.00%
2         3        3     NaN                  C++   7.668%   +2.92%
3         4        4     NaN               Python   5.192%   +1.64%
4         5        5     NaN                   C#   4.402%   +0.95%
5         6        6     NaN    Visual Basic .NET   4.124%   +0.73%
6         7        9  change                  PHP   3.321%   +0.63%
7         8        7  change           JavaScript   2.923%   -0.15%
8         9        -  change                  SQL   1.987%   +1.99%
9        10       11  change                 Ruby   1.182%   -1.25%
>>> type(df) # 查看类型
<class 'pandas.core.frame.DataFrame'>
左右滑动查看更多

# 查看DataFrame的内容

>>> df.columns # 查看所有列Index(['May 2018', 'May 2017', 'Change', 'Programming Language', 'Ratings', 'Change.1'], dtype='object')>>> df.Ratings # 查看Ratings这一列0 16.380%1 14.000%2 7.668%3 5.192%4 4.402%5 4.124%6 3.321%7 2.923%8 1.987%9 1.182%Name: Ratings, dtype: object
# 某一列的类型为Series(序列)
>>> type(df['May 2018'])
<class 'pandas.core.series.Series'>
左右滑动查看更多

# 从df中提取指定的列

>>> df_new = DataFrame(df, columns = ['May 2018', 'Change.1']) # 从df中提取两列生成新的DataFrame>>> df_new May 2018 Change.10 1 +1.74%1 2 +7.00%2 3 +2.92%3 4 +1.64%4 5 +0.95%5 6 +0.73%6 7 +0.63%7 8 -0.15%8 9 +1.99%9 10 -1.25%
左右滑动查看更多

# 如果columns里面多加了不存在的列名,该列会自动赋值为空(NaN)

>>> df_new = DataFrame(df, columns=['May 2018', 'Change', 'Sep 2019'])>>> df_new May 2018 Change Sep 2019 # Sep 2019 这一列是不存在的0 1 NaN NaN1 2 NaN NaN2 3 NaN NaN3 4 NaN NaN4 5 NaN NaN5 6 NaN NaN6 7 change NaN7 8 change NaN8 9 change NaN9 10 change NaN
左右滑动查看更多


# 列的赋值

# 对序列进行赋值,使用range函数

>>> df_new['Sep 2019'] = range(0,10) # 赋值 0-9 这10个数给 Sep 2019 这一列>>> df_new May 2018 Change Sep 20190 1 NaN 01 2 NaN 12 3 NaN 23 4 NaN 34 5 NaN 45 6 NaN 56 7 change 67 8 change 78 9 change 89 10 change 9
左右滑动查看更多

# 使用np下的arange(数组)函数

>>> df_new['Sep 2019'] = np.arange(1,11)>>> df_new May 2018 Change Sep 20190 1 NaN 11 2 NaN 22 3 NaN 33 4 NaN 44 5 NaN 55 6 NaN 66 7 change 77 8 change 88 9 change 99 10 change 10
左右滑动查看更多

# 使用序列修改

>>> df_new['Sep 2019'] = pd.Series(np.arange(2,12))>>> df_new May 2018 Change Sep 20190 1 NaN 21 2 NaN 32 3 NaN 43 4 NaN 54 5 NaN 65 6 NaN 76 7 change 87 8 change 98 9 change 109 10 change 11
左右滑动查看更多

# 对单独某一列下的某些行进行赋值

>>> df_new['Sep 2019'] = pd.Series([100,200], index=[1,2])
>>> df_new
   May 2018  Change  Sep 2019
0         1     NaN       NaN
1         2     NaN     100.0
2         3     NaN     200.0
3         4     NaN       NaN
4         5     NaN       NaN
5         6     NaN       NaN
6         7  change       NaN
7         8  change       NaN
8         9  change       NaN
9        10  change       NaN
左右滑动查看更多


官网:

http://pandas.pydata.org/pandas-docs/version/0.14.1/

原文链接:
https://blog.csdn.net/weixin_39778570/article/details/80550480




DataFrame和Series


# 导入需要的模块

>>> import pandas as pd>>> import numpy as np>>> from pandas import Series, DataFrame
左右滑动查看更多

# 首先创建一个字典

>>> data = {'Student':['XiaoMing','XiaoHong','XiaoWang'],'Grade':[100,90,20],'Class':['RG1','RG2','RG3']}
左右滑动查看更多

# 创建一个Series对象

>>> s1 = pd.Series(data['Student'])>>> s10 XiaoMing1 XiaoHong2 XiaoWangdtype: object
左右滑动查看更多

# 查看一下这个对象的属性

>>> s1.valuesarray(['XiaoMing', 'XiaoHong', 'XiaoWang'], dtype=object)
左右滑动查看更多

# 当没有给索引赋值的时候默认为

>>> s1.indexRangeIndex(start=0, stop=3, step=1)
左右滑动查看更多

# 创建并修改默认索引

>>> s1 = pd.Series(data['Student'], index = ['first','second','three'])>>> s1first XiaoMingsecond XiaoHongthree XiaoWangdtype: object
左右滑动查看更多

# 给它起个列名‘GDUF’

>>> s1 = pd.Series(data['Student'], index = ['first','second','three'], name = 'GDUF')>>> s1first XiaoMingsecond XiaoHongthree XiaoWangName: GDUF, dtype: object# 注意:索引和列名是可以修改的,方式如下:>>> s1.name = 'haha'>>> s1.index = ['first','second','three']
左右滑动查看更多

# 使用字典创建DataFrame对象

>>> df1 = pd.DataFrame(data)>>> df1 Student Grade Class0 XiaoMing 100 RG11 XiaoHong 90 RG22 XiaoWang 20 RG3
左右滑动查看更多

# 查看DataFrame的属性

# 某一列>>> cou = df1['Student']>>> cou0 XiaoMing1 XiaoHong2 XiaoWangName: Student, dtype: object # Series是有名字的
# 列的类型为Series>>> type(cou)<class 'pandas.core.series.Series'># 默认索引>>> df1.indexRangeIndex(start=0, stop=3, step=1)
# 查看某一列某一行>>> c1 = df1['Student'][0]>>> c1'XiaoMing'>>> type(c1)<class 'str'>
# 查看某一行的属性
>>> for row in df1.iterrows():
...     print(row), print(type(row)), print(type(row[0])), print(type(row[1]))
...     break
...
(0, Student    XiaoMing
Grade           100
Class           RG1
Name: 0, dtype: object) # 每一行默认的名字为0
<class 'tuple'> # 每一行为一个tuple对象
<class 'int'> # 默认索引为int类型
<class 'pandas.core.series.Series'> # 值为Series类型
(None, None, None, None)
左右滑动查看更多

# 通过Series构建DataFrame对象

>>> data{'Student': ['XiaoMing', 'XiaoHong', 'XiaoWang'], 'Grade': [100, 90, 20], 'Class': ['RG1', 'RG2', 'RG3']}>>> s1 = pd.Series(data['Student'])>>> s2 = pd.Series(data['Grade'])>>> s3 = pd.Series(data['Class'])>>> df_new = pd.DataFrame([s1,s2,s3]) # 当以列表的形式构建的时候会按行来放,有时候可以优先选择按行放 >>> df_new 0 1 20 XiaoMing XiaoHong XiaoWang1 100 90 202 RG1 RG2 RG3>>> df_new.T # 进行行列转置 0 1 20 XiaoMing 100 RG11 XiaoHong 90 RG22 XiaoWang 20 RG3
左右滑动查看更多

# 修改索引(通过字典直接构建,修改索引)

>>> df_new = DataFrame(data)
>>> df_new
    Student  Grade Class
0  XiaoMing    100   RG1
1  XiaoHong     90   RG2
2  XiaoWang     20   RG3
>>> df_new.index = ['first', 'second', 'third']
>>> df_new
         Student  Grade Class
first   XiaoMing    100   RG1
second  XiaoHong     90   RG2
third   XiaoWang     20   RG3
左右滑动查看更多

# 看一下这行代码

# 构建Series对象的时候,同时指定索引,名字

>>> s1 = pd.Series(data['Student'], name = 'Student', index = ['one','two','three'])
左右滑动查看更多

# 这样子DataFrame的结构就呼之欲出了,即列(columns),索引(index),值(values)

>>> df2 = pd.DataFrame(s1)>>> df2 Studentone XiaoMingtwo XiaoHongthree XiaoWang
>>> s1 = pd.Series(data['Student'], name = 'Student', index = ['one','two','three'])
>>> s2 = pd.Series(data['Grade'], name = 'Grade', index = ['one','two','three'])
>>> s3 = pd.Series(data['Class'], name = 'Class', index = ['one','two','three'])
>>> df2 = pd.DataFrame([s1,s2,s3])
>>> df2
              one       two     three
Student  XiaoMing  XiaoHong  XiaoWang
Grade         100        90        20
Class         RG1       RG2       RG3
# 用列表构建是按行放置的,需要转置一下
>>> df2.T Student Grade Classone XiaoMing 100 RG1two XiaoHong 90 RG2three XiaoWang 20 RG3
左右滑动查看更多

# 下面这种也是一种比较直观的构建方式。但是比较繁琐 (字典中的值为Series)

>>> d = {'one': Series([1., 2., 3.], index=['a', 'b', 'c']), 'two': Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}>>> df = DataFrame(d)>>> df one twoa 1.0 1.0b 2.0 2.0c 3.0 3.0d NaN 4.0


官网:

http://pandas.pydata.org/pandas-docs/version/0.14.1/

原文链接:
https://blog.csdn.net/weixin_39778570/article/details/80551032



·END·



点击搜索你感兴趣的内容吧


软件应用丨超强干货!Numpy简单使用(入门)


软件应用丨Python数据分析,学习路径拆解及资源推荐(附详细思维导图)


机器学习丨经济学研究中的机器学习:回顾与展望





数据Seminar




这里是大数据、分析技术与学术研究的三叉路口





出处:CSDN 作者:向前走别回头 推荐:青酱 排版编辑:青酱





    欢迎扫描👇二维码添加关注    


    您可能也对以下帖子感兴趣

    文章有问题?点此查看未经处理的缓存