Numpy

reshape

重塑

import numpy as np
b = np.arange(27).reshape(3, 3, 3)
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]]

reshape(行，列，z) 中若某一个维度写成-1，那么Numpy会根据元素总数和其余维度自动计算出该维度的大小
a.shape 数组的维度 x,y
a.ndim 数组轴的个数
a.dtype 数组中元素类型的对象

array

a.dtype 数组中元素类型的对象
d = np.array([1,2,3,4,5], dtype='float64') #转换类型
np.zeros((3,3,3))创建数值为0的数组（此处是矩阵）
np.ones((3,3,3))创建数值为1的数组
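np.unique(a) #返回数据里唯一的值的array（ndarray没有unique方法，pandas的Series才有s.unique()）
pd.Series(a).value_counts() #统计各值出现的频率（value_counts是pandas的方法，ndarray需先转成Series）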

arange

np.arange(5) #数列
np.arange(5,20, step = 2)
[ 5 7 9 11 13 15 17 19]

linspace

numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None) #等差数列
num : int, optional #生成的样本数，默认是50。必须是非负
endpoint : bool, optional #如果是真，则一定包括stop，如果为False，一定不会有stop
retstep : bool, optional #If True, return (samples, step), where step is the spacing between samples.
np.logspace(…,base=2) #等比，比值为2

other

索引，切片和迭代

x.astype(np.float64) #强制类型转换
np.sum(x1, axis=0)  #求和，axis=0按列求和（每列各行相加），axis=1按行求和（每行各列相加）
np.cumsum(x1)  #累加
np.log(g)  #对数
np.exp(g)  #指数
np.sqrt(g)  #开方
np.square(x)  #平方
np.maximum(x1, x2) #返回大的元素array
np.where(x1>x2,a,b) #a和b可以是x1或者任何数，第一个参数如if。条件为True的位置取a，否则取b，返回新的array
np.round(x, 2) #保留两位小数（不传第二个参数时默认decimals=0，即取整）


a
[[0 1 2]
[3 4 5]
[6 7 8]]

a[:, 1]
[1 4 7]

a[1, :]
[3 4 5]

a[a>3]
[4 5 6 7 8]

a.ravel()  #将多维数组降为一维
[0 1 2 3 4 5 6 7 8]

a1 = np.array([9,10,11])
a2 = np.vstack((a,a1))
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]

a2.reshape(2,6)# 不改变原有数组
a2.resize(2,6) # 改变原有数组（多了补0，少了删去）
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]

a2.transpose() #转置
[[ 0  6]
 [ 1  7]
 [ 2  8]
 [ 3  9]
 [ 4 10]
 [ 5 11]]

np.hsplit(a2,3)  # 水平切分(沿列方向),按列等分成3份
[array([[0, 1],
       [6, 7]]), array([[2, 3],
       [8, 9]]), array([[ 4,  5],
       [10, 11]])]

np.eye(5) # 生成对角矩阵(单位矩阵)
[[ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.]]

b
[[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]]

 [[18 19 20]
  [21 22 23]
  [24 25 26]]]
  
for element in b.flat: #flat数组元素迭代器
    print(element)
0
1
2
3
...
22
23
24
25
26

矩阵计算

A = np.array([
[1, 2],
[4, 5],
[7, 8]
])
B = np.array([
[4, 4, 2],
[2, 3, 1],
])

print(np.dot(A, B))#相乘
[[ 8 10  4]
 [26 31 13]
 [44 52 22]]

A_=np.linalg.inv(A) #A的逆矩阵（A必须是可逆方阵；上面3x2的A会抛出LinAlgError，非方阵可用np.linalg.pinv求伪逆）

Pandas

API: http://pandas.pydata.org/pandas-docs/stable/whatsnew.html

### Series
Series是pandas系列里的一维数组，它可以包含任何数据类型的标签。

pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

obj.name 自定义名称
obj.values 数组表示形式
obj.index 索引对象
obj2 = Series([4,7,-5,3], index=['d','b','a','c'])  带有标记索引

read_excel

返回一个DataFrame

pd.read_excel(io, sheetname=0,header=0,skiprows=None,index_col=None,names=None,
                  parse_cols=None,date_parser=None,na_values=None,thousands=None,
                convert_float=True,has_index_names=None,converters=None,dtype=None,
                true_values=None,false_values=None,engine=None,squeeze=False,**kwds)

io ：excel 路径；
sheetname=[0,n]，若sheetname=None是返回全表。注意：int/string返回的是dataframe，而none和list返回的是dict of dataframe;
header ：指定作为列名的行，默认0，即取第一行，数据为列名行以下的数据；若数据不含列名，则设定 header = None；
skiprows：省略指定行数的数据
skip_footer：省略从尾部数的行数据
index_col ：指定列为索引列，也可以使用 u'string'.
names：指定列名，传入一个list数据

import pandas as pd
data = pd.read_excel('sz50.xlsx', sheetname=0, index_col='datetime')
print (data.index)
s = data.close #名为close的列

print(s.head(10))#输出前10条，不带参数输出5， .tail()同理

monthly_prices = s.resample('M').last() #给每个月的最后一天抽样  resample('M') 按时间序列遍历
#resample见 https://blog.csdn.net/wangshuang1631/article/details/52314944/
s.resample('M').median()  #返回每月数据的中位数
df.resample('21D').mean() #21天均值（how参数已废弃，改为在resample后链式调用聚合方法）

def custom_resampler(array_like):
    return array_like[0] #返回本组的第一个值（月初的值）
first_of_month_prices = s.resample('M').apply(custom_resampler) #apply对每一个传入的数据调用自定义函数custom_resampler

data_s= s.loc[datetime(2017,1,1):datetime(2017,1,10)]
data_r=data_s.resample('D').mean() #resample在日期之间空缺的地方插入天数，index自动补NaN

data_r.head(10).fillna(method='ffill') #填写缺失的数据 ffill表示用前一个值填，bfill用后一个填。也可以改用指定值填充：fillna(a)，a是一个字典{0:10, 1:20, 3:200},表示列0的空值填充10...（method和指定值不能同时传入）

data_r.head(10).dropna(axis=0) #直接删除缺失数据的行（列）
data_r.head(10).dropna(how='all') #只有整行数据为空才删除
pd.merge(df1, df2, on='name', how='inner') #内连接合并，根据name列，还有左连接右连接，外连接
df.join(df2) #合并df
pd.concat([df1, df2], axis=0) #轴向连接，将多个对象沿轴堆叠到一起（待连接的对象要放在一个list里）
df.combine_first(df2) #合并，忽略重复数据，都有的话保留df的内容

DataFrame

pandas.DataFrame( data, index, columns, dtype, copy)  #data: 数据格式（ndarray，series，map等）， index: 行标签， columns: 列标签

other (pandas.)

df.sort_index(axis=0) #排序
df.sort_values(by='xx', ascending=False)#对xx列排序， 降序,True升序
DataFrame.drop('x',axis=0) #删除索引为x的行（1则是列）
DataFrame.mean(axis=0)  #沿axis轴向求平均，得到该轴数据的平均值。默认axis=0，即对每一列求平均
DataFrame.std() #求标准差
df.cov() #协方差
df.corr() #线性相关程度(相关系数) 参数有{'pearson', 'kendall', 'spearman'},默认pearson
DataFrame.describe()  #生成描述性统计数据，包括:count,mean,std,min,25%,50%,75%,max
DataFrame.diff(periods=1, axis=0) #periods：移动的幅度，int类型，默认值为1，指相对于前一行；负数指后x行。 axis：移动的轴，0是x，1是y。{0 or ‘index’, 1 or ‘columns’}
DataFrame.pct_change(periods=1) #计算增长率,相对前periods个的增长率，如periods=2则(x[t]-x[t-2])/x[t-2]
DataFrame.rolling(window=30).mean() #前30个数据的均值（30天均线）
pd.date_range('20160101',periods=5) #返回日期索引(DatetimeIndex)
pd.concat([s_1, s_2], axis=0) #连接两个表. axis=0添加行, =1则增加列
df.rename(columns={'close':'xx'}, inplace = True) #更改close列名为xx

df.iloc[0], 根据位置（整数下标）来索引
df['x'].iloc[0:5:2]， 'x'列中从位置0到5（不含5），间隔2的行数据。（iloc只接受整数位置，不能直接传列名'x'）
df.loc['a', 3]是-5，根据索引来索引，第三列数据
df.loc['20160102':'20160104']，  范围索引
df.loc[df['x'] > 6]  #布尔索引
如果只传入一个字典，则结果Series/df中的索引就是原字典的键

df[df.isnull().values==True] #输出含缺失值的行
df.rank(axis=0, ascending=True) #排名（返回每个值的名次）,ascending为True升序
DataFrame.stack(level=-1, dropna=True) #将指定级别从列堆叠到索引。也就是将数据的列“旋转”为行。
df.unstack() #重组数据帧（stack的逆操作）。将数据的行“旋转”为列。
df.pivot('a','b') #将堆叠的df的列a,b重组成a（表头），b(表内容)
DataFrame.aggregate(func, axis=0, *args, **kwargs) #聚合,同.agg()  func:可以是[sum,min,max,mean,std]等numpy里的函数
df.agg({'stock1' : np.sum,
       'stock2' : np.std,
       'stock3' : np.mean}) #针对不同列用不同算法
df.agg([np.sum, np.mean, np.std]) #对同一列进行多个不同的计算
df.agg({'high':'max','low':'min','close':'last', 'open':'first'}) # 对同一列用不同方法形成自定义命名的列
df.expanding() #用法同rolling，但窗口从第一行一直扩展到当前行（累计窗口）

DataFrame.ewm(com=None, span=None, halflife=None, alpha=None, min_periods=0, adjust=True, ignore_na=False, axis=0) #指数加权函数 http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.ewm.html

df['x'].value_counts()  #对x列的数据进行分组计数
df.idxmax()  #返回列中最大值索引
df.isin(['e'])  #返回同形状的布尔DataFrame，标记每个元素是否在列表['e']中
df.info()  #查看数据有无缺失以及子数据类型
df.describe()  #展示每组数据的统计信息
pd.qcut(df,10) #将数据分成10份
pd.cut(df,list) #list是一个列表，将按照list的间隔分成len(list)-1份，通常配合pd.get_dummies()使用，生成哑变量矩阵
pd.crosstab(a,b) #构建交叉表，a是行信息，b列信息，都是Series。 常与df.div()配合使用，求出a与b的相对比例，每组和为1.
df.pivot_table() #透视表  ....更加底层groupby()

Panel

pandas.Panel(data, items, major_axis, minor_axis, dtype, copy) 
# data	数据采取各种形式，如：ndarray，series，map，lists，dict，constant和另一个数据帧(DataFrame)
# items - axis 0，每个项目对应于内部包含的数据帧(DataFrame)。
# major_axis - axis 1，它是每个数据帧(DataFrame)的索引(行)。
# minor_axis - axis 2，它是每个数据帧(DataFrame)的列。


pn.to_frame() #转换成frame
PN.rename(items={'xx':'yy','xxG':'xxx'})
Panel.transpose(*args) #置换尺寸 args ： 三个位置参数：每一个代指{0,1,2，'items'，'major_axis'，'minor_axis'}


访问数据
pn['close']
pn.major_xs('2017-04-10')
pn.minor_xs('2017-04-10')

Panel.ix[0:3,-1,'xx'] #xx的dataframe的0-2列最后一行

Talib

tb = talib.abstract #先实例化
tb.MA(value,x) # x日均线
# 例子ta.abstract.MA(value, 5) for name, value in PN.iteritems()
ta.abstract.MACD(value) #MACD线，参数类型Series
talib.ATR() #平均真实波动范围

Datetime

datetime(2017, 1, 2, 3, 4, 5, 6) #2017-01-02 03:04:05.000006
datetime.now() #当前时间
datetime.utcnow()  #当前UTC时间
datetime.fromtimestamp(1000000000) #2001-09-09 09:46:40
datetime.now().strftime(format="%Y-%m-%d %H:%M:%S.%f") # 按给定的format将datetime格式化成字符串（反过来把字符串解析成datetime用strptime） 2018-09-16 17:32:07.261732
#格式：https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior

datetime进行运算，返回timedelta
delta=timedelta(days=1, seconds=-2,microseconds=-3, minutes=-4, hours=-5, weeks=6)
delta.days, delta.seconds, delta.microseconds #42 68157 999997

now = datetime.now()
td = timedelta(1)
print (now)
print (td)
print (now + td * 2)
2017-11-21 20:58:50.763878
1 day, 0:00:00
2017-11-23 20:58:50.763878

Matplotlib

pyplot

plt.plot(x,y,format_string,**kwargs, kind='line') #绘制线图 #xy可传入一个dataframe代替，format_string设置线的类型颜色。kind={bar:纵向柱状图, barh:横向柱状图, kde:概率密度曲线, pie:饼图}
plt.hist(df, bins=20) #绘制直方图 bin指定柱的个数
plt.scatter(stock1.close, stock2.close, c = ['c','r'], s = 20) #散点图 c设置颜色， s设置点大小

plt.title("Stock Prices")
plt.ylabel("Price")
plt.xlabel("Date")
plt.show() 
plt.hlines(0, xmin,xmax, linestyles='dashed') #绘制一条水平参考线, 0代表y=0
plt.axvline() #竖直参考线
plt.figure(figsize = None) #figsize:(15,7) 生成15*7英寸的图像
plt.legend(['x2','x1']) #为同一个图上的曲线添加图例（按绘制顺序依次标注为x2、x1）
plt.fill_between(x, y1, y2, alpha=0.5, color="r") #在y1和y2两条曲线之间填充透明度为50%的红色


subplot绘制子图
plt.subplot(2,1,1) #分成2x1，选中第一个，即第一行第一列的子图 此时plt.plot(stock1.close)加进数据
plt.subplot(2,1,2) #分成2x1，占用第二个，即第二行第一列的子图

plt.grid(True) # 显示背景的网格线，还有样式可选
pyplot.axis(v) #v = [xmin, xmax, ymin, ymax] 坐标轴的显示范围


a1.plot()
a2 = a1.twinx() #生成a1画布的反向y轴（右侧y轴）
a2.plot()

题外

pd.rolling_std() * np.sqrt(array)#股票收益波动

import statsmodels.api as sm #statsmodels实现线性拟合  OLS
from statsmodels import regression
y 
x = np.arange(0,8)
x = sm.add_constant(x) #在x左边加上 值全为1的列
model = regression.linear_model.OLS(y, x).fit() #regression.linear_model只是调用其里面的函数OLS。y和x都是array。fit()拟合
b = model.params[0]
k = model.params[1]
y_fit = k*x+b #线性函数
p
np.polynomial.Chebyshev.fit(x,y,p) #多项式拟合。x，y是array，p拟合次数

参考：http://www.fxdayu.com/static/standalone/tutorials.html?article=241

Python Quant