DataFrame数据框
# 巧用复制黏贴
>>> import numpy as np >>> import pandas as pd >>> from pandas import Series, DataFrame >>> import webbrowser >>> link = 'http://www.tiobe.com/tiobe-index/' >>> webbrowser.open(link) # 打开一个网页 然后把要生成的数据复制到剪切板,以下选择了编程语言前10行 True
>>> df = pd.read_clipboard() # 执行这段代码自动生成DataFrame对象 >>> df May 2018 May 2017 Change Programming Language Ratings Change.1 0 1 1 NaN Java 16.380% +1.74% 1 2 2 NaN C 14.000% +7.00% 2 3 3 NaN C++ 7.668% +2.92% 3 4 4 NaN Python 5.192% +1.64% 4 5 5 NaN C# 4.402% +0.95% 5 6 6 NaN Visual Basic .NET 4.124% +0.73% 6 7 9 change PHP 3.321% +0.63% 7 8 7 change JavaScript 2.923% -0.15% 8 9 - change SQL 1.987% +1.99% 9 10 11 change Ruby 1.182% -1.25% >>> type(df) # 查看类型 <class 'pandas.core.frame.DataFrame'>
# 查看DataFrame的内容
>>> df.columns # 查看所有列 Index(['May 2018', 'May 2017', 'Change', 'Programming Language', 'Ratings', 'Change.1'], dtype='object') >>> df.Ratings # 查看Ratings这一列 0 16.380% 1 14.000% 2 7.668% 3 5.192% 4 4.402% 5 4.124% 6 3.321% 7 2.923% 8 1.987% 9 1.182% Name: Ratings, dtype: object
# 某一列的类型为Series(序列) >>> type(df['May 2018']) <class 'pandas.core.series.Series'>
# 从df中提取指定的列
>>> df_new = DataFrame(df, columns = ['May 2018', 'Change.1']) # 从df中提取两列生成新的DataFrame >>> df_new May 2018 Change.1 0 1 +1.74% 1 2 +7.00% 2 3 +2.92% 3 4 +1.64% 4 5 +0.95% 5 6 +0.73% 6 7 +0.63% 7 8 -0.15% 8 9 +1.99% 9 10 -1.25%
# 如果columns里指定了不存在的列名,该列会自动填充为NaN
>>> df_new = DataFrame(df, columns=['May 2018', 'Change', 'Sep 2019']) >>> df_new May 2018 Change Sep 2019 # Sep 2019 这一列是不存在的 0 1 NaN NaN 1 2 NaN NaN 2 3 NaN NaN 3 4 NaN NaN 4 5 NaN NaN 5 6 NaN NaN 6 7 change NaN 7 8 change NaN 8 9 change NaN 9 10 change NaN# 列的赋值
# 对序列进行赋值,使用range函数
>>> df_new['Sep 2019'] = range(0,10) # 赋值 0-9 这10个数给 Sep 2019 这一列 >>> df_new May 2018 Change Sep 2019 0 1 NaN 0 1 2 NaN 1 2 3 NaN 2 3 4 NaN 3 4 5 NaN 4 5 6 NaN 5 6 7 change 6 7 8 change 7 8 9 change 8 9 10 change 9# 使用np下的arange(数组)函数
>>> df_new['Sep 2019'] = np.arange(1,11) >>> df_new May 2018 Change Sep 2019 0 1 NaN 1 1 2 NaN 2 2 3 NaN 3 3 4 NaN 4 4 5 NaN 5 5 6 NaN 6 6 7 change 7 7 8 change 8 8 9 change 9 9 10 change 10# 使用序列修改
>>> df_new['Sep 2019'] = pd.Series(np.arange(2,12)) >>> df_new May 2018 Change Sep 2019 0 1 NaN 2 1 2 NaN 3 2 3 NaN 4 3 4 NaN 5 4 5 NaN 6 5 6 NaN 7 6 7 change 8 7 8 change 9 8 9 change 10 9 10 change 11
# 对单独某一列下的某些行进行赋值
>>> df_new['Sep 2019'] = pd.Series([100,200], index=[1,2]) >>> df_new May 2018 Change Sep 2019 0 1 NaN NaN 1 2 NaN 100.0 2 3 NaN 200.0 3 4 NaN NaN 4 5 NaN NaN 5 6 NaN NaN 6 7 change NaN 7 8 change NaN 8 9 change NaN 9 10 change NaN
官网:http://pandas.pydata.org/pandas-docs/version/0.14.1/