# pandas速度复习 (pandas quick review)

#第一课基础series,dataframe
import pandas as pd
import numpy as np
# --- Series: a one-dimensional labelled array ---
s1 = pd.Series([4, 7, -5, 3])
print(s1)  # no index supplied, so the default integer index is used
print(s1.values)
print(s1.index)
s2 = pd.Series([4.0, 6.5, -0.5, 4.2], index=['d', 'b', 'a', 'c'])
print(s2['a'])  # look up a single value through a custom label
print(s2[['a', 'd']])  # a list of labels selects several entries
print('b' in s2)  # membership is tested against the index labels
# A Series behaves like a fixed-length ordered dict, so it can be built from one
dic1 = {'apple': 5, 'pen': 3, 'applepen': 10}
s3 = pd.Series(dic1)
print(s3)  # the dict keys become the index labels

# --- DataFrame: a two-dimensional labelled table ---
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [10000, 30000, 50000, 80000],
    'pay': [5000, 20000, 30000, 30000],
}
df1 = pd.DataFrame(data)
print(df1)
df2 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df2)
df3 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['a', 'c', 'b'],
    columns=[2, 33, 44, 5],
)
print(df3)
print(df1.columns)  # column labels
print(df1.index)  # row labels
print(df1.values)  # underlying values as a NumPy array
print(df1.describe())
# describe() reports count, mean, std, min, the quartiles and max per column
print(df1.T)
# sort_index returns a new frame instead of sorting in place, so the result
# has to be kept; otherwise the print below would show the unsorted columns.
df3 = df3.sort_index(axis=1)
print(df3)

# Lesson 2: selecting data
dates = pd.date_range('20170101', periods=6)
df1 = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df1)
print(df1['a'])  # a column label selects a column
print(df1.a)  # attribute access returns the same column
print(df1[0:2])  # a slice selects rows (here the first two)
# Select by label
print(df1.loc['20170101', ['a', 'c']])  # one row label, two column labels
print(df1.loc[:, ['a', 'b']])  # every row, columns a and b
# Select by position
print(df1.iloc[2])  # third row (positions are 0-based)
print(df1.iloc[[1, 2, 4], [1, 3]])  # rows and columns picked by position
# Mixed label/position selection. The old `.ix` indexer no longer exists in
# current pandas, so the positional column slice is translated to labels
# first and everything goes through `.loc` (label slices include the end).
mixed = df1.loc['20170101':'20170104', df1.columns[2:4]]
print(mixed)

# Lesson 3: assigning values and basic operations
dates = np.arange(20170101, 20170107)
df1 = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
df1.iloc[2, 2] = 100  # set a single cell by position
print(df1)
df1[df1.a > 10] = 0  # zero out every row whose column a is greater than 10
print(df1)
# Set the zeros in column a to 1. This is done with a single .loc call:
# chained indexing (df1.a[mask] = 1) assigns through an intermediate object,
# which triggers SettingWithCopyWarning and has no effect under Copy-on-Write.
df1.loc[df1.a == 0, 'a'] = 1
df1['e'] = 10  # add a new column filled with a constant
print(df1)
# Assigning to a row label that does not exist yet appends a new row;
# the columns that are not listed are left as NaN.
df1.loc[20170107, ['a', 'b', 'c']] = [1, 2, 3]
print(df1)
df1.insert(1, 'g', df1['c'])  # insert a copy of column c at position 1 as g
print(df1)
df2 = df1.drop([20170101], axis=0)  # drop a row by label; df1 is unchanged
print(df2)

# Lesson 4: handling missing data
dates = np.arange(20170101, 20170105)
df1 = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=dates,
    columns=['a', 'b', 'c'],
)
print(df1)
# Re-framing df1 with two extra column labels creates d and e filled with NaN
df2 = pd.DataFrame(df1, index=dates, columns=['a', 'b', 'c', 'd', 'e'])
print(df2)
# Fill the new columns only partially: assignment aligns on the index, so
# the labels missing from each Series stay NaN.
s1 = pd.Series([3, 4, 6], index=dates[:3])
s2 = pd.Series([32, 5, 2], index=dates[1:])
df2['d'] = s1
df2['e'] = s2
print(df2)
# Drop rows containing missing values (axis=0 is rows, axis=1 is columns).
# how='any' drops a row with at least one NaN; how='all' only drops rows that
# are entirely NaN. dropna returns a new frame, so the result must be kept.
df_dropped = df2.dropna(axis=0, how='all')
print(df_dropped)
# Replace every missing value with a constant; fillna also returns a new frame
df_filled = df2.fillna(value=0)
print(df_filled)
print(df2.isnull())
# Does the frame contain at least one missing value / only missing values?
has_null = df2.isnull().values.any()
all_null = df2.isnull().values.all()
print(has_null)
print(all_null)

# Lesson 5: reading and writing files
# Load the GBK-encoded CSV file into a DataFrame
file = pd.read_csv(
    'people.csv',
    encoding='gbk',
)
print(file)
# Update one cell of the loaded frame: third row, first column (0-based 2, 0).
# Nothing in this section writes the change back to disk.
file.iloc[2, 0] = '深圳'
print(file)
# Lesson 6: combining frames
# --- concat ---
import pandas as pd
import numpy as np
# Three 3x4 frames with identical columns holding 0-11, 12-23 and 24-35
df1, df2, df3 = (
    pd.DataFrame(
        np.arange(first, first + 12).reshape((3, 4)),
        columns=['a', 'b', 'c', 'd'],
    )
    for first in (0, 12, 24)
)
print(df1, df2, df3, sep='\n')
# Stack vertically; each frame keeps its own 0-2 row labels
df4 = pd.concat([df1, df2, df3], axis=0)
print(df4)
# Stack vertically again, this time renumbering the rows from scratch
df4 = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(df4)
# Place the frames side by side
df5 = pd.concat([df1, df2, df3], axis=1)
print(df5)
# Two frames whose columns only partly overlap (f versus d)
df1 = pd.DataFrame(
    np.arange(0, 12).reshape((3, 4)),
    columns=['a', 'b', 'c', 'f'],
)
df2 = pd.DataFrame(
    np.arange(12, 24).reshape((3, 4)),
    columns=['a', 'b', 'c', 'd'],
)
print(df1, df2, sep='\n')
# outer join keeps the union of the columns and fills the gaps with NaN
df6 = pd.concat([df1, df2], sort=True, ignore_index=True, join='outer')
print(df6)
# inner join keeps only the columns present in both frames
df7 = pd.concat([df1, df2], sort=True, ignore_index=True, join='inner')
print(df7)
# --- merge ---
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'a': ['a0', 'a1', 'a2', 'a3'],
    'b': ['b0', 'b1', 'b2', 'b3'],
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'c': ['c0', 'c1', 'c2', 'c3'],
    'd': ['d0', 'd1', 'd2', 'd3'],
})
print(left, right, sep='\n')
# Join the two frames on their shared key column
res = left.merge(right, on='key')
print(res)
# Lesson 7: plotting
import matplotlib.pyplot as plt
# 1000 standard-normal samples labelled 0-999, turned into a running total
# (cumulative sum), which plots as a single random-walk line
data = pd.Series(
    np.random.randn(1000),
    index=np.arange(1000),
).cumsum()
data.plot()
plt.show()
# Same idea with four independent series, one per column a-d
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['a', 'b', 'c', 'd'],
).cumsum()
print(data.head())
data.plot()
plt.show()












# 猜你喜欢

# 转载自blog.csdn.net/cj1064789374/article/details/88177221
# 今日推荐