Pandas教程（一）

参考资料：清华计算机博士带你学-Python金融量化分析

demo1-Series使用特性

demo2-Series整数索引问题

demo8-DataFrame数据对齐与缺失数据处理

demo1-Series使用特性

#Pandas-数据分析核心包
import pandas as pd
import numpy as np

# demo1 - Series basics
# A Series behaves like a cross between a NumPy array and a dict.

print("*****Series支持数组的特性：索引*****")
sr = pd.Series([2,3,4,5],index=["a","b","c","d"]) # build a Series from an array-like
print(sr)
print(sr.iloc[0])   # access by position; plain sr[0] on a string index is deprecated in pandas 2.x
print(sr["a"]) # access by label
print(sr+2)    # arithmetic with a scalar
print(sr+2*sr) # arithmetic between two Series
print(sr.iloc[:2])  # positional slice
print(sr.iloc[[1,3]]) # fancy indexing with a list of positions
print(np.sqrt(sr)) # NumPy universal functions apply element-wise
print(sr[sr>3])    # boolean-mask indexing

print("*****Series支持字典的特性：标签*****")
sr = pd.Series({"a":1,"b":2}) # build a Series from a dict
print(sr)
print(sr["a"],sr["b"])
print(sr[["a","b"]]) # fancy indexing with a list of labels
print("a" in sr,"c" in sr) # `in` tests membership among the labels

for each in sr:
    print(each)  # unlike a built-in dict, iterating yields the values

print(sr.index,sr.index[0])
print(sr.values,sr.values[1])

*****Series支持数组的特性：索引*****
a    2
b    3
c    4
d    5
dtype: int64
2
2
a    4
b    5
c    6
d    7
dtype: int64
a     6
b     9
c    12
d    15
dtype: int64
a    2
b    3
dtype: int64
b    3
d    5
dtype: int64
a    1.414214
b    1.732051
c    2.000000
d    2.236068
dtype: float64
c    4
d    5
dtype: int64
*****Series支持字典的特性：标签*****
a    1
b    2
dtype: int64
1 2
a    1
b    2
dtype: int64
True False
1
2
Index(['a', 'b'], dtype='object') a
[1 2] 2

demo2-Series整数索引问题

# demo2 - the integer-index pitfall
# When the labels are integers, [] with an int is a label lookup.
# Use iloc (position) or loc (label) to say explicitly which one is meant.
sr = pd.Series(np.arange(20))
srNew = sr.iloc[10:].copy()  # labels are now 10..19 while positions are 0..9
print(srNew,srNew[10])  # label 10 (the first row); there is no position 10 in srNew
print(srNew.iloc[9]) # by position -> last row
print(srNew.loc[10]) # by label -> first row

10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
dtype: int32 10
19
10

demo3-Series数据对齐

# demo3 - arithmetic between Series aligns on index labels, not on position

print("*****sr1与sr2等长*****")
sr1 = pd.Series({"c":12,"a":23,"d":34})
sr2 = pd.Series({"d":11,"c":20,"a":10})
print(sr1+sr2) # values are paired up by label before adding

print("*****sr1与sr2不等长*****")
# sr1 is reused unchanged; sr2 gains a label "b" that sr1 does not have
sr2 = pd.Series({"d":11,"c":20,"a":10,"b":21})
print(sr1+sr2) # "b" has no partner in sr1, so its result is NaN

print("*****sr1与sr2不等长:fill_value*****")
# same two Series as above; fill_value stands in for the missing side
print(sr1.add(sr2,fill_value=0)) # note the value now produced for label "b"

*****sr1与sr2等长*****
a    33
c    32
d    45
dtype: int64
*****sr1与sr2不等长*****
a    33.0
b     NaN
c    32.0
d    45.0
dtype: float64
*****sr1与sr2不等长:fill_value*****
a    33.0
b    21.0
c    32.0
d    45.0
dtype: float64

demo4-Series缺失值的处理

# demo4 - handling missing values in a Series

sr1 = pd.Series({"c":12,"a":23,"d":34})
sr2 = pd.Series({"d":11,"c":20,"a":10,"b":21})
sr3 = sr1.add(sr2)  # label "b" exists only in sr2, so its sum is NaN

print("*****删除缺失值的3种方法*****")
print(sr3)
print(sr3.isnull())   # True where the value is missing
print(sr3.notnull())  # True where the value is present
print(sr3[~sr3.isnull()])  # way 1: invert the isnull mask
print(sr3[sr3.notnull()])  # way 2: use the notnull mask directly
print(sr3.dropna())        # way 3: dropna

print(sr3)   # none of the calls above modified sr3 itself

print("*****填充缺失值*****")
print(sr3.fillna(0))  # fill with 0
print(sr3.fillna(sr3.mean())) # fill with the mean of the non-missing values

*****删除缺失值的3种方法*****
a    33.0
b     NaN
c    32.0
d    45.0
dtype: float64
a    False
b     True
c    False
d    False
dtype: bool
a     True
b    False
c     True
d     True
dtype: bool
a    33.0
c    32.0
d    45.0
dtype: float64
a    33.0
c    32.0
d    45.0
dtype: float64
a    33.0
c    32.0
d    45.0
dtype: float64
a    33.0
b     NaN
c    32.0
d    45.0
dtype: float64
*****填充缺失值*****
a    33.0
b     0.0
c    32.0
d    45.0
dtype: float64
a    33.000000
b    36.666667
c    32.000000
d    45.000000
dtype: float64

#Series小结
#数组与字典的结合体：下标索引 + 标签访问
#整数索引loc与iloc
#数据对齐/ sr1.add(sr2,fill_value=0)
#缺失数据处理：①dropna ②fillna(0)

demo5-DataFrame的创建

# demo5 - creating a DataFrame
# A DataFrame is a table: think of it as a dict of Series that share one index.
print(pd.DataFrame({"one":[1,2,3],"two":[4,5,6]},index=list("abc")))  # one index given for every column

# A column holds a single dtype. Here each Series brings its own index, so "one"
# has no value for "d"; that cell becomes NaN and the column turns into float.
print(pd.DataFrame({
    "one": pd.Series([1,2,3],index=list("abc")),
    "two": pd.Series([2,1,3,4],index=list("abcd")),
}))


# pd.read_csv()  reads a file into a DataFrame
# df.to_csv()    writes a DataFrame to a file

data = pd.read_csv("600519.csv",index_col="trade_date")
data  # bare expression: a notebook displays it; run as a script it prints nothing

   one  two
a    1    4
b    2    5
c    3    6
   one  two
a  1.0    2
b  2.0    1
c  3.0    3
d  NaN    4

demo6-DataFrame常见属性

# demo6 - common DataFrame attributes
#
#   index       row labels
#   T           transpose
#   columns     column labels
#   values      the data as a NumPy array
#   describe()  quick summary statistics


df = pd.DataFrame({"one":[1,2,3],"two":[4,5,6]},index=list("abc"))
print(df)
print(df.index)   # row labels
print(df.T)       # rows and columns swapped
print(df.columns) # column labels / column index
print(df.values,type(df.values),df.values.shape)  # the data as a 2-D ndarray
print(df.describe())  # count, mean, std, min, quartiles and max per column

   one  two
a    1    4
b    2    5
c    3    6
Index(['a', 'b', 'c'], dtype='object')
     a  b  c
one  1  2  3
two  4  5  6
Index(['one', 'two'], dtype='object')
[[1 4]
 [2 5]
 [3 6]] <class 'numpy.ndarray'> (3, 2)
       one  two
count  3.0  3.0
mean   2.0  5.0
std    1.0  1.0
min    1.0  4.0
25%    1.5  4.5
50%    2.0  5.0
75%    2.5  5.5
max    3.0  6.0

demo7-DataFrame索引与切片

# demo7 - indexing and slicing a DataFrame
#
# Prefer loc (labels) or iloc (positions), written as [row, column].

df = pd.DataFrame({
    "one": pd.Series([1,2,3],index=list("abc")),
    "two": pd.Series([2,1,3,4],index=list("abcd")),
})
print(df)
print(df["one"]["a"])  # column first, then row; chaining two [] lookups is not recommended

print("*****取某一个元素*****")
print(df.loc["a","one"]) # one cell: row label, then column label

print("*****取某一列元素*****")
print(df["one"],type(df["one"]))  # a single column comes back as a Series

print("*****取某一行元素*****")
print(df.iloc[0])    # one row, by position
print(df.loc["a",:]) # one row, by label; ":" selects every column
print(df.loc["a",])  # same row; an omitted column part also means every column
print(df.loc["a"])   # same row, shortest spelling

print("*****根据需求任意取元素*****")
print(df.loc[["a","d"],:]) # plain labels, slices, boolean masks and label lists can be mixed freely

   one  two
a  1.0    2
b  2.0    1
c  3.0    3
d  NaN    4
1.0
*****取某一个元素*****
1.0
*****取某一列元素*****
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64 <class 'pandas.core.series.Series'>
*****取某一行元素*****
one    1.0
two    2.0
Name: a, dtype: float64
one    1.0
two    2.0
Name: a, dtype: float64
one    1.0
two    2.0
Name: a, dtype: float64
one    1.0
two    2.0
Name: a, dtype: float64
*****根据需求任意取元素*****
   one  two
a  1.0    2
d  NaN    4

demo8-DataFrame数据对齐与缺失数据处理

# demo8 - DataFrame alignment and missing-data handling

df1 = pd.DataFrame({"two":[1,2,3,4],"one":[4,5,6,7]},index=list("cdba"))
df2 = pd.DataFrame({"one":pd.Series([1,2,3],index=["a","b","c"]),"two":pd.Series([2,1,3,4],index=["a","b","c","d"])})

print("*****DataFrame数据对齐*****")
print(df1)
print(df2)
print(df1+df2)  # both the row labels and the column labels are aligned before adding

# Missing-data helpers:
#   dropna(axis=0, how="any")
#   fillna(value)
#   isnull()
#   notnull()
#
# "two" is an int64 column and NaN is a float. Widen the column explicitly first:
# writing NaN straight into an integer column relies on an implicit upcast that
# pandas 2.1+ warns about (PDEP-6).
df2["two"] = df2["two"].astype("float64")
df2.loc["d","two"] = np.nan
df2.loc["c","two"] = np.nan
print(df2)
print("*****fillna()*****")
print(df2.fillna(999)) # replace every NaN with the given value

print("*****dropna:默认参数axis=0为行,how='any'即任意有nan即删除*****")
print(df2.dropna()) # defaults: axis=0 (rows), how="any" -> drop a row if any cell in it is NaN
print(df2.dropna(how="any",axis=0))  # the same call with the defaults spelled out
print(df2.dropna(how="all")) # drop a row only when every cell in it is NaN

print(df2.dropna(axis=1)) # axis=1 drops columns; both columns contain NaN here, so none remain

*****DataFrame数据对齐*****
   two  one
c    1    4
d    2    5
b    3    6
a    4    7
   one  two
a  1.0    2
b  2.0    1
c  3.0    3
d  NaN    4
   one  two
a  8.0    6
b  8.0    4
c  7.0    4
d  NaN    6
   one  two
a  1.0  2.0
b  2.0  1.0
c  3.0  NaN
d  NaN  NaN
*****fillna()*****
     one    two
a    1.0    2.0
b    2.0    1.0
c    3.0  999.0
d  999.0  999.0
*****dropna:默认参数axis=0为行,how='any'即任意有nan即删除*****
   one  two
a  1.0  2.0
b  2.0  1.0
   one  two
a  1.0  2.0
b  2.0  1.0
   one  two
a  1.0  2.0
b  2.0  1.0
c  3.0  NaN
Empty DataFrame
Columns: []
Index: [a, b, c, d]

demo9-Pandas常用函数

# demo9 - frequently used pandas functions
#
#   mean(axis=0, skipna=True)
#   sum(axis=1)
#   sort_index(axis, ..., ascending)
#   sort_values(by, axis, ascending)
#   NumPy functions can be applied to pandas objects as well
#
# axis=0 works down the rows (one result per column); axis=1 works across the
# columns (one result per row). dropna's axis argument follows the same idea.
# Recommended reading: https://zhuanlan.zhihu.com/p/110105054

# `df` is not created here; it is whatever an earlier cell left behind
# (the demo7 cell builds the last one before this point).
print(df)
print("*****mean*****")
print(df.mean())       # per-column mean by default, e.g. (1+2+3)/3 = 2 for "one"
print(df.mean(axis=0)) # axis=0 spelled out: same result as the default
print(df.mean(skipna=True))  # skipna=True spelled out: NaN cells are ignored
print(df.mean(axis=1)) # per-row mean


print("*****sum*****")
print(df.sum())  # per-column sum by default
print(df.sum(axis=1))  # per-row sum

# sort_values places NaN last by default, for ascending and descending order alike
print("*****sort_values*****")
print(df)
print(df.sort_values(by="two")) # by: the column to sort the rows on
print(df.sort_values(by="two",ascending=False)) # ascending=False gives descending order
print(df.sort_values(by="b",axis=1))  # axis=1: reorder the columns by the values in row "b"

print("*****sort_index*****")
dfNew = pd.DataFrame({"two":[1,2,3,4],"one":[4,5,6,7]},index=list("cdba"))
print(dfNew)
print(dfNew.sort_index())  # rows ordered by row label
print(dfNew.sort_index(ascending=False))  # rows ordered by row label, descending
print(dfNew.sort_index(axis=1))  # columns ordered by column label

   one  two
a  1.0    2
b  2.0    1
c  3.0    3
d  NaN    4
*****mean*****
one    2.0
two    2.5
dtype: float64
one    2.0
two    2.5
dtype: float64
one    2.0
two    2.5
dtype: float64
a    1.5
b    1.5
c    3.0
d    4.0
dtype: float64
*****sum*****
one     6.0
two    10.0
dtype: float64
a    3.0
b    3.0
c    6.0
d    4.0
dtype: float64
*****sort_values*****
   one  two
a  1.0    2
b  2.0    1
c  3.0    3
d  NaN    4
   one  two
b  2.0    1
a  1.0    2
c  3.0    3
d  NaN    4
   one  two
d  NaN    4
c  3.0    3
a  1.0    2
b  2.0    1
   two  one
a    2  1.0
b    1  2.0
c    3  3.0
d    4  NaN
*****sort_index*****
   two  one
c    1    4
d    2    5
b    3    6
a    4    7
   two  one
a    4    7
b    3    6
c    1    4
d    2    5
   two  one
d    2    5
c    1    4
b    3    6
a    4    7
   one  two
c    4    1
d    5    2
b    6    3
a    7    4

demo10-时间对象

# demo10 - datetime objects

print("*****pd.to_datetime()*****")
# The three strings use three different layouts. pandas >= 2.0 infers a single
# format from the first element of a list and rejects elements that do not match
# it, so each string is converted on its own and the results are collected.
dateStrings = ["2021-7-20","1996.10.04","1997-MAY-20"]
mixedDates = pd.DatetimeIndex([pd.to_datetime(s) for s in dateStrings])
print(mixedDates)
print(type(mixedDates)) # a DatetimeIndex, typically used as an index
print(type(mixedDates[0])) # each element is a Timestamp

print("*****生成一定时间范围的时间对象*****")
print(pd.date_range("2010.1.1","2010.5.1")) # explicit start and end
print(pd.date_range("2010.1.1",periods=30)) # periods = number of entries; freq defaults to "D" (daily)
print(pd.date_range("2010.1.1",periods=30,freq="h")) # hourly steps
print(pd.date_range("2010.1.1",periods=30,freq="1h30min")) # steps of 1 hour 30 minutes
print(pd.date_range("2022.9.1",periods=30,freq="B")) # "B" = business days (weekends skipped)
print(type(pd.date_range("2022.9.1",periods=30,freq="B")))
print(type(pd.date_range("2022.9.1",periods=30,freq="B")[0])) # each element is a Timestamp

*****pd.to_datetime()*****
DatetimeIndex(['2021-07-20', '1996-10-04', '1997-05-20'], dtype='datetime64[ns]', freq=None)
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
*****生成一定时间范围的时间对象*****
DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
               '2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
               '2010-01-09', '2010-01-10',
               ...
               '2010-04-22', '2010-04-23', '2010-04-24', '2010-04-25',
               '2010-04-26', '2010-04-27', '2010-04-28', '2010-04-29',
               '2010-04-30', '2010-05-01'],
              dtype='datetime64[ns]', length=121, freq='D')
DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
               '2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
               '2010-01-09', '2010-01-10', '2010-01-11', '2010-01-12',
               '2010-01-13', '2010-01-14', '2010-01-15', '2010-01-16',
               '2010-01-17', '2010-01-18', '2010-01-19', '2010-01-20',
               '2010-01-21', '2010-01-22', '2010-01-23', '2010-01-24',
               '2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28',
               '2010-01-29', '2010-01-30'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:00:00',
               '2010-01-01 02:00:00', '2010-01-01 03:00:00',
               '2010-01-01 04:00:00', '2010-01-01 05:00:00',
               '2010-01-01 06:00:00', '2010-01-01 07:00:00',
               '2010-01-01 08:00:00', '2010-01-01 09:00:00',
               '2010-01-01 10:00:00', '2010-01-01 11:00:00',
               '2010-01-01 12:00:00', '2010-01-01 13:00:00',
               '2010-01-01 14:00:00', '2010-01-01 15:00:00',
               '2010-01-01 16:00:00', '2010-01-01 17:00:00',
               '2010-01-01 18:00:00', '2010-01-01 19:00:00',
               '2010-01-01 20:00:00', '2010-01-01 21:00:00',
               '2010-01-01 22:00:00', '2010-01-01 23:00:00',
               '2010-01-02 00:00:00', '2010-01-02 01:00:00',
               '2010-01-02 02:00:00', '2010-01-02 03:00:00',
               '2010-01-02 04:00:00', '2010-01-02 05:00:00'],
              dtype='datetime64[ns]', freq='H')
DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:30:00',
               '2010-01-01 03:00:00', '2010-01-01 04:30:00',
               '2010-01-01 06:00:00', '2010-01-01 07:30:00',
               '2010-01-01 09:00:00', '2010-01-01 10:30:00',
               '2010-01-01 12:00:00', '2010-01-01 13:30:00',
               '2010-01-01 15:00:00', '2010-01-01 16:30:00',
               '2010-01-01 18:00:00', '2010-01-01 19:30:00',
               '2010-01-01 21:00:00', '2010-01-01 22:30:00',
               '2010-01-02 00:00:00', '2010-01-02 01:30:00',
               '2010-01-02 03:00:00', '2010-01-02 04:30:00',
               '2010-01-02 06:00:00', '2010-01-02 07:30:00',
               '2010-01-02 09:00:00', '2010-01-02 10:30:00',
               '2010-01-02 12:00:00', '2010-01-02 13:30:00',
               '2010-01-02 15:00:00', '2010-01-02 16:30:00',
               '2010-01-02 18:00:00', '2010-01-02 19:30:00'],
              dtype='datetime64[ns]', freq='90T')
DatetimeIndex(['2022-09-01', '2022-09-02', '2022-09-05', '2022-09-06',
               '2022-09-07', '2022-09-08', '2022-09-09', '2022-09-12',
               '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16',
               '2022-09-19', '2022-09-20', '2022-09-21', '2022-09-22',
               '2022-09-23', '2022-09-26', '2022-09-27', '2022-09-28',
               '2022-09-29', '2022-09-30', '2022-10-03', '2022-10-04',
               '2022-10-05', '2022-10-06', '2022-10-07', '2022-10-10',
               '2022-10-11', '2022-10-12'],
              dtype='datetime64[ns]', freq='B')
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>

demo11-时间序列

# demo11 - time series

print("*****时间对象索引*****")
dateIndex = pd.date_range("2022.9.1",periods=66)
sr = pd.Series(np.arange(66),index=dateIndex)
print(sr.index) # the index is made of datetime objects

print(sr["2022-9-25":]) # slice from 2022-09-25 onwards
print(sr["2022-10":])   # slice from the start of 2022-10 onwards
print(sr["2022-10"])    # only the rows in 2022-10

print("*****resample重采样*****")
# Use the canonical upper-case offset aliases; lower-case spellings are deprecated
# in recent pandas releases.
print(sr.resample("W").sum())      # weekly bins ending on Sunday
print(sr.resample("W-MON").sum())  # weekly bins ending on Monday
# The month-end alias was renamed from "M" to "ME" in pandas 2.2; the offset
# object means the same thing under either naming.
print(sr.resample(pd.offsets.MonthEnd()).mean())

*****时间对象索引*****
DatetimeIndex(['2022-09-01', '2022-09-02', '2022-09-03', '2022-09-04',
               '2022-09-05', '2022-09-06', '2022-09-07', '2022-09-08',
               '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12',
               '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16',
               '2022-09-17', '2022-09-18', '2022-09-19', '2022-09-20',
               '2022-09-21', '2022-09-22', '2022-09-23', '2022-09-24',
               '2022-09-25', '2022-09-26', '2022-09-27', '2022-09-28',
               '2022-09-29', '2022-09-30', '2022-10-01', '2022-10-02',
               '2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06',
               '2022-10-07', '2022-10-08', '2022-10-09', '2022-10-10',
               '2022-10-11', '2022-10-12', '2022-10-13', '2022-10-14',
               '2022-10-15', '2022-10-16', '2022-10-17', '2022-10-18',
               '2022-10-19', '2022-10-20', '2022-10-21', '2022-10-22',
               '2022-10-23', '2022-10-24', '2022-10-25', '2022-10-26',
               '2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
               '2022-10-31', '2022-11-01', '2022-11-02', '2022-11-03',
               '2022-11-04', '2022-11-05'],
              dtype='datetime64[ns]', freq='D')
2022-09-25    24
2022-09-26    25
2022-09-27    26
2022-09-28    27
2022-09-29    28
2022-09-30    29
2022-10-01    30
2022-10-02    31
2022-10-03    32
2022-10-04    33
2022-10-05    34
2022-10-06    35
2022-10-07    36
2022-10-08    37
2022-10-09    38
2022-10-10    39
2022-10-11    40
2022-10-12    41
2022-10-13    42
2022-10-14    43
2022-10-15    44
2022-10-16    45
2022-10-17    46
2022-10-18    47
2022-10-19    48
2022-10-20    49
2022-10-21    50
2022-10-22    51
2022-10-23    52
2022-10-24    53
2022-10-25    54
2022-10-26    55
2022-10-27    56
2022-10-28    57
2022-10-29    58
2022-10-30    59
2022-10-31    60
2022-11-01    61
2022-11-02    62
2022-11-03    63
2022-11-04    64
2022-11-05    65
Freq: D, dtype: int32
2022-10-01    30
2022-10-02    31
2022-10-03    32
2022-10-04    33
2022-10-05    34
2022-10-06    35
2022-10-07    36
2022-10-08    37
2022-10-09    38
2022-10-10    39
2022-10-11    40
2022-10-12    41
2022-10-13    42
2022-10-14    43
2022-10-15    44
2022-10-16    45
2022-10-17    46
2022-10-18    47
2022-10-19    48
2022-10-20    49
2022-10-21    50
2022-10-22    51
2022-10-23    52
2022-10-24    53
2022-10-25    54
2022-10-26    55
2022-10-27    56
2022-10-28    57
2022-10-29    58
2022-10-30    59
2022-10-31    60
2022-11-01    61
2022-11-02    62
2022-11-03    63
2022-11-04    64
2022-11-05    65
Freq: D, dtype: int32
2022-10-01    30
2022-10-02    31
2022-10-03    32
2022-10-04    33
2022-10-05    34
2022-10-06    35
2022-10-07    36
2022-10-08    37
2022-10-09    38
2022-10-10    39
2022-10-11    40
2022-10-12    41
2022-10-13    42
2022-10-14    43
2022-10-15    44
2022-10-16    45
2022-10-17    46
2022-10-18    47
2022-10-19    48
2022-10-20    49
2022-10-21    50
2022-10-22    51
2022-10-23    52
2022-10-24    53
2022-10-25    54
2022-10-26    55
2022-10-27    56
2022-10-28    57
2022-10-29    58
2022-10-30    59
2022-10-31    60
Freq: D, dtype: int32
*****resample重采样*****
2022-09-04      6
2022-09-11     49
2022-09-18     98
2022-09-25    147
2022-10-02    196
2022-10-09    245
2022-10-16    294
2022-10-23    343
2022-10-30    392
2022-11-06    375
Freq: W-SUN, dtype: int32
2022-09-05     10
2022-09-12     56
2022-09-19    105
2022-09-26    154
2022-10-03    203
2022-10-10    252
2022-10-17    301
2022-10-24    350
2022-10-31    399
2022-11-07    315
Freq: W-MON, dtype: int32
2022-09-30    14.5
2022-10-31    45.0
2022-11-30    63.0
Freq: M, dtype: float64

demo12-Pandas文件操作

# demo12 - reading and writing files with pandas
#
# csv: values separated by commas
# tsv: values separated by tabs (xlsx is a workbook format, not delimited text)
#
# read_csv parameters:
#   index_col    column to use as the index, by position or by name,
#                e.g. index_col=0 or index_col="date"
#   parse_dates  True: try to parse the index as dates;
#                a list: parse just the listed columns as dates
#   header       read_csv treats the first row as column names by default;
#                pass header=None when the file has no header row
#   names        column names to use, e.g. together with header=None
#   skiprows     rows to skip (rarely needed)
#   na_values    extra strings to read as NaN, so that files marking missing
#                data in different ways end up uniform
#   data.columns = list("abcdefg") renames the columns after loading
#
# to_csv parameters:
#   sep      field separator, "," by default
#   na_rep   text written for missing values, empty string by default
#   header   header=False leaves out the column names
#   index    index=False leaves out the row index
#   columns  list of the columns to write
#
# Also available: to_json, to_excel, to_pickle, ...

csvPath = "601318.csv"

data = pd.read_csv(csvPath,index_col="date")
print(type(data.index))  # without date parsing the index is a plain Index

data = pd.read_csv(csvPath,index_col="date",parse_dates=True)
print(type(data.index))
data = pd.read_csv(csvPath,index_col="date",parse_dates=["date"])
print(type(data.index))

# reload without date parsing, then rename all the columns in one assignment
data = pd.read_csv(csvPath,index_col="date")
data.columns = list("abcdefg")
print(data)

data = pd.read_csv(csvPath,index_col="date",parse_dates=["date"])
print(data)
print(data.loc["2017-5","close"])  # important: with a datetime index, a partial date string selects that whole month

data.to_csv("test.csv",columns=["close","open"],header=False,index=False)

<class 'pandas.core.indexes.base.Index'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
               a       b       c       d       e           f       g
date                                                                
2007/3/1       0  21.878  20.473  22.302  20.040  1977633.51  601318
2007/3/2       1  20.565  20.307  20.758  20.075   425048.32  601318
2007/3/5       2  20.119  19.419  20.202  19.047   419196.74  601318
2007/3/6       3  19.253  19.800  20.128  19.143   297727.88  601318
2007/3/7       4  19.817  20.338  20.522  19.651   287463.78  601318
...          ...     ...     ...     ...     ...         ...     ...
2017/12/11  2558  71.200  73.250  73.310  70.820  1139927.00  601318
2017/12/12  2559  73.250  71.210  73.560  71.170   777900.00  601318
2017/12/13  2560  71.210  72.120  72.620  70.200   865117.00  601318
2017/12/14  2561  72.120  71.010  72.160  70.600   676186.00  601318
2017/12/15  2562  70.690  70.380  71.440  70.050   735547.00  601318

[2563 rows x 7 columns]
            Unnamed: 0    open   close    high     low      volume    code
date                                                                      
2007-03-01           0  21.878  20.473  22.302  20.040  1977633.51  601318
2007-03-02           1  20.565  20.307  20.758  20.075   425048.32  601318
2007-03-05           2  20.119  19.419  20.202  19.047   419196.74  601318
2007-03-06           3  19.253  19.800  20.128  19.143   297727.88  601318
2007-03-07           4  19.817  20.338  20.522  19.651   287463.78  601318
...                ...     ...     ...     ...     ...         ...     ...
2017-12-11        2558  71.200  73.250  73.310  70.820  1139927.00  601318
2017-12-12        2559  73.250  71.210  73.560  71.170   777900.00  601318
2017-12-13        2560  71.210  72.120  72.620  70.200   865117.00  601318
2017-12-14        2561  72.120  71.010  72.160  70.600   676186.00  601318
2017-12-15        2562  70.690  70.380  71.440  70.050   735547.00  601318

[2563 rows x 7 columns]
date
2017-05-02    37.167
2017-05-03    37.255
2017-05-04    37.079
2017-05-05    36.530
2017-05-08    37.049
2017-05-09    37.245
2017-05-10    39.059
2017-05-11    38.990
2017-05-12    40.147
2017-05-15    40.098
2017-05-16    40.285
2017-05-17    39.628
2017-05-18    39.824
2017-05-19    40.206
2017-05-22    42.000
2017-05-23    42.324
2017-05-24    42.186
2017-05-25    44.598
2017-05-26    44.294
2017-05-31    44.187
Name: close, dtype: float64

参考资料：清华计算机博士带你学-Python金融量化分析

demo1-Series使用特性

demo2-Series整数索引问题

demo3-Series数据对齐

demo4-Series缺失值的处理

demo5-DataFrame的创建

demo6-DataFrame常见属性

demo7-DataFrame索引与切片

demo8-DataFrame数据对齐与缺失数据处理

demo9-Pandas常用函数

demo10-时间对象

demo11-时间序列

demo12-Pandas文件操作

猜你喜欢