数据分析之--Matplotlib入门

Matplotlib

Matplotlib

绘制统计图形
读取图片保存图片以及图片显示

Seaborn

1.辅助的库，可以被pyplot控制
2.辅助绘制更多的图形，更加好看，功能更加强大
3.添加了调色板
4.set_style:white(默认),dark,darkgrid,ticks
5.color_palette():调色板
6.palplot():显示颜色

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" theme for the plots that follow.
sns.set_style('darkgrid')

# Calling plot() with no data still creates an (empty) figure and axes.
plt.plot()

# Build a 7-colour palette from the 'pink_r' colormap; every entry is an
# RGB colour. palplot() draws the palette as a row of swatches.
pink_r = sns.color_palette('pink_r', n_colors=7)
sns.palplot(pink_r)

绘制线性图

需要X，Y

# np.linspace uses a closed interval: both end points (0 and 14) are included.
# (The original used full-width commas here, which is a SyntaxError.)
x = np.linspace(0, 14, 100)
for i in range(1, 7):
    # Shift the phase and scale the amplitude so each curve is distinct.
    y = np.sin(x + i * 0.5) * (7 - i * np.pi)
    plt.plot(x, y)

# When only one sequence is given it is used as the y values.
h = np.array([170, 171, 169, 181, 190, 162, 170])
plt.plot(h)

图片的标题

# Enlarge the figure; create it first so the plot calls below draw into it.
plt.figure(figsize=(10, 6))
x = np.linspace(-10, 10, 100)
y = np.sin(x)
z = np.cos(x)

# Successive plt.plot calls are overlaid on the same axes.
# `label` is the text shown for the line in the legend.
plt.plot(x, y, label='sin')
plt.plot(x, z, label='cos')

# Figure title.
#   fontsize  font size
#   color     font colour
#   alpha     transparency
#   rotation  rotation angle of the text
plt.title('sin&cos', fontsize=20, color='red', alpha=0.5, rotation=360)

# X-axis title.
plt.xlabel('X=range(-10,10)', fontsize=20, color='orange', alpha=0.5)
# Y-axis title.
plt.ylabel('f(x)=sin(x)&f(x)=cos(x)', fontsize=20, color='orange', alpha=0.5, rotation=90)

# Limit the visible X range.
plt.xlim([-10, 10])

# Limit the visible Y range.
plt.ylim(-1, 1)

# Legend: `loc` is its position, `ncol` is the number of columns.
plt.legend(loc=[0, 0.1], ncol=1)

点和线的样式

# Enlarge the figure.
plt.figure(figsize=(10, 6))
x = np.linspace(-10, 10, 100)
y, z = np.sin(x), np.cos(x)

# Quick reference for the styling keywords used below.
#
# Line style (linestyle / ls):
#   '--' dashed    '-' solid    ':' dotted
# Line width (linewidth / lw)
# Line colour (color / c):
#   b blue   g green   r red   y yellow   k black
#   c cyan   m magenta   w white
#   RGB values are accepted as well (hex string or tuple).
# Point marker (marker):
#   'o' circle   's' square   'd' diamond   'x' cross
# Marker size (markersize)

plt.plot(
    x, y,
    label='sin',
    linestyle='--',
    lw=1,
    c='#FF0000',
    marker='o',
    markersize=10,
)
plt.plot(x, z, label='cos', ls=':', c=(0, 1, 0))

# Legend: `loc` is the position, `ncol` the number of columns.
plt.legend(loc=[0, 1], ncol=2)

X和Y轴可读的映射

matplotlib支持LaTeX的语法

x = np.linspace(-np.pi, np.pi, 100)
y1 = np.sin(x)
y2 = np.cos(x)

# Create an Axes object.
# The three arguments are rows, columns and index (the index starts at 1,
# not 0, and must not be repeated).
axes = plt.subplot(1, 1, 1)
# Draw both curves on this Axes.
axes.plot(x, y1, label='sin')
axes.plot(x, y2, label='cos')
axes.legend()
axes.set_title('A')
axes.set_xlabel('X')
axes.set_ylabel('Y')

pi = np.pi
# Map tick positions to readable labels: first set where the ticks go,
# then set the text shown at each of those positions.
axes.set_xticks([-pi, -pi/2, 0, pi/2, pi])
# Raw strings keep the LaTeX backslashes literal; '\p' in a normal string is
# an invalid escape sequence and triggers a warning on recent Python versions.
axes.set_xticklabels([r'$-\pi$', r'$-\pi/2$', 0, r'$\pi/2$', r'$\pi$'], fontsize=20)


axes.set_yticks([-1, 0, 1])
axes.set_yticklabels(['min', 0, 'max'], fontsize=20, rotation=20, color='orange')

直方图

1.统计元素出现的次数
2.可以描述分布的状态

# One array element per character; the histogram counts how often each occurs.
a = np.array(list('abcdabcdaa'))
plt.hist(
    a,
    bins=20,
    color='r',
)

柱状图

from sklearn.datasets import load_iris

# Load the data set once and reuse it, instead of calling load_iris()
# separately for the data, the target and the feature names.
iris_bunch = load_iris()
data = iris_bunch.data
# Turn the target into a column so it can be joined to the feature matrix.
target = iris_bunch.target.reshape(-1, 1)

# Join features and target column-wise; feature_names supplies the column
# names and the target column is called 'labels'.
iris = pd.DataFrame(
    np.concatenate([data, target], axis=1),
    columns=iris_bunch.feature_names + ['labels'],
)

# Store the labels as small integers.
iris['labels'] = iris['labels'].astype('int8')

# Approach 1 (preview only): map the integer codes to letters.
# The result is not assigned back to the DataFrame.
iris.labels.map({0: 'A', 1: 'B', 2: 'C'})

b = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 0])
# Author's notes: seaborn does not count str values here, and the plot it
# draws comes with a density curve on top of the histogram.
sns.distplot(
    b,
    bins=20,
    color='r',
)

# Approach 2: use the integer labels as indices into a lookup array.
c = np.array(['A', 'B', 'C'])
iris.labels = c[iris.labels]

# Show the resulting column names.
iris.columns
Out[.]:Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'labels'],
      dtype='object')

# Each column of the DataFrame becomes one bar.
# NOTE(review): the original note said the bar shows "the range of the maximum
# value"; seaborn's barplot is documented to plot an aggregate (mean by
# default) with an error bar — confirm and reword.
sns.barplot(data=iris)

# Load the Titanic passenger data from a local CSV file.
titanic = pd.read_csv('./titanic.csv')

# x / y name the columns plotted on each axis.
# hue splits every bar into one bar per value of the given column.
# (The original statement was broken in the middle of the name `titanic`,
# which is a SyntaxError.)
sns.barplot(x='Sex', y='Survived', hue='Pclass', data=titanic)

条件性柱状图

# Load the Titanic passenger data from a local CSV file.
titanic = pd.read_csv('./titanic.csv')

# x and y name the columns shown on the horizontal and vertical axes.
sns.barplot(
    x='Sex',
    y='Survived',
    data=titanic,
)

饼图

表示的是一个比例

# Peek at the first rows.
titanic.head()

# Row counts per sex, and per sex among rows with Survived == 0.
male = len(titanic.query("Sex=='male'"))
male_d = len(titanic.query("Sex=='male' & Survived==0"))
female = len(titanic.query("Sex=='female'"))
female_d = len(titanic.query("Sex=='female' & Survived==0"))

# Preview the three two-colour palettes used by the pie charts.
for palette_name in ('hls', 'rainbow', 'cool'):
    sns.palplot(sns.color_palette(palette_name, 2))

# One wide figure that holds three pies side by side.
plt.figure(figsize=(24, 8))

# Pie chart arguments:
#   x        the data values
#   labels   one label per wedge
#   autopct  format of the percentage text ("%%" is a literal percent sign)
#   explode  how far each wedge is pulled away from the centre

# Pie 1: passengers by sex.
x = np.array([male, female])
labels = ['male', 'female']
axes1 = plt.subplot(1, 3, 1)
axes1.pie(
    x,
    labels=labels,
    autopct="%.2f%%",
    colors=sns.color_palette('hls', 2),
)
axes1.axis('image')
axes1.legend(['male=%s' % male, 'female=%s' % female], loc=[0, 1])

# Both remaining pies share the same wedge labels and explode settings.
labels = ['L', 'D']
explode = [.1, 0]

# Pie 2: male passengers, remaining count vs. the Survived == 0 count.
male_l = male - male_d
x1 = np.array([male_l, male_d])
axes2 = plt.subplot(1, 3, 2)
axes2.pie(
    x1,
    labels=labels,
    autopct="%.2f%%",
    explode=explode,
    colors=sns.color_palette('rainbow', 2),
)
axes2.axis('image')
axes2.legend(['L=%s' % male_l, 'D=%s' % male_d], loc=[0, 1])

# Pie 3: female passengers, remaining count vs. the Survived == 0 count.
female_l = female - female_d
x2 = np.array([female_l, female_d])
axes3 = plt.subplot(1, 3, 3)
axes3.pie(
    x2,
    labels=labels,
    autopct="%.2f%%",
    explode=explode,
    colors=sns.color_palette('cool', 2),
)
axes3.axis('image')
axes3.legend(['L=%s' % female_l, 'D=%s' % female_d], loc=[0, 1])

# Save the figure.
#   fname      path of the output file
#   dpi        pixel density
#   facecolor  background colour
plt.savefig('./pie.png', dpi=100)

箱图

类别型的离散值没有必要绘制箱图
查看范围,查看异常值

# Column overview and first rows.
titanic.info()
titanic.head()

# Distinct values found in the 'Parch' column.
titanic['Parch'].unique()
Out[.]:array([0, 1, 2, 5, 3, 4, 6])

# Keep only the numeric columns by excluding the object-dtype ones.
# select_dtypes accepts `include` (default None) and `exclude`.
non_object_part = titanic.select_dtypes(exclude=['object'])
non_object_part.columns
Out[.]:Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

# Box plot of the Age column via pandas ...
titanic['Age'].plot(kind='box')

# ... and the same column via seaborn.
sns.boxplot(data=titanic['Age'])

散布图

观察量与量之间的分布关系的
二维的图形，一列代表x,一列代表y
c参数用于指定点的颜色(color)；传入数值序列(例如类别编码)时，会通过cmap映射成对应的颜色

# Encode the labels as integer category codes.
iris.labels = iris.labels.astype('category').cat.codes

# Scatter plots are used a lot for classification problems in machine learning.
# Plot columns 0 and 1 for three consecutive row ranges, each with its own
# marker and colour.
row_groups = (
    (slice(None, 50), 'o', 'r'),
    (slice(50, 100), 'd', 'y'),
    (slice(100, None), 'x', 'g'),
)
for rows, marker, color in row_groups:
    plt.scatter(iris.iloc[rows, 0], iris.iloc[rows, 1], marker=marker, color=color)

# Plot columns 2 and 3 for all rows, coloured by label code through a colormap.
plt.scatter(
    iris.iloc[:, 2],
    iris.iloc[:, 3],
    c=iris.labels,
    cmap='gray',
)

散布密度图

直方图

# List the available column names.
iris.columns

# Joint plot of two columns with the default kind.
sns.jointplot(
    x='sepal length (cm)',
    y='sepal width (cm)',
    data=iris,
)

# Available kinds: "scatter", "reg" (regression), "resid" (residuals),
# "kde" (density), "hex" (hexagonal bins).
sns.jointplot(
    x='sepal length (cm)',
    y='sepal width (cm)',
    data=iris,
    kind='hex',
)

回归散布图

# Scatter plot with a fitted regression line: sepal length vs. sepal width.
sns.regplot(
    x='sepal length (cm)',
    y='sepal width (cm)',
    data=iris,
)

# Author's note: the fitted line is the equation for which the average
# distance between the points and the line is smallest.
sns.regplot(
    x='petal length (cm)',
    y='petal width (cm)',
    data=iris,
)

线性分类散布图

# One regression fit per value of 'labels', each drawn with its own marker.
sns.lmplot(
    x='petal length (cm)',
    y='petal width (cm)',
    data=iris,
    hue='labels',
    markers=['o', 'd', 'x'],
)

散布图矩阵

检查量与量之间相关性
要求hue的值是str

# Turn the integer codes back into letter labels for use as `hue`.
code_to_letter = {0: 'A', 1: 'B', 2: 'C'}
iris.labels = iris.labels.replace(code_to_letter)

# Scatter-plot matrix with density curves on the diagonal.
# Author's observation: in the second column the three classes overlap too
# much, so it is not easy to classify on.
sns.pairplot(iris, hue='labels', diag_kind='kde')

# Author's observation: the second column has outliers.
sns.boxplot(data=iris)

# A correlation coefficient of 1 means two columns move together perfectly,
# -1 means a perfect negative correlation, and 0 means no linear correlation.
# Plotting a column against itself shows the coefficient-1 case.
first_column = iris.iloc[:, 0]
plt.scatter(first_column, iris.iloc[:, 0])

3D图

# 3D scatter plot.
from mpl_toolkits.mplot3d.axes3d import Axes3D

# Create the figure and a 3D axes on it.
plt.figure(figsize=(16, 12))
axes3d = plt.subplot(projection='3d')

# Integer category codes of the labels drive the colour of every point.
target = iris.labels.astype('category').cat.codes
axes3d.scatter3D(
    iris.iloc[:, 0],
    iris.iloc[:, 2],
    iris.iloc[:, 3],
    c=target,
    cmap='rainbow',
    s=50,
)

# 3D surface plot.
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)

# z must be a function of x and y, so build the 2-D coordinate grids first.
xx, yy = np.meshgrid(x, y)

# Surface height: z = x^3 + y^3.
z = xx**3 + yy**3

fig = plt.figure(figsize=(16, 10))
# Create the 3D axes through the figure so it is actually attached to it;
# constructing Axes3D(fig) directly no longer adds the axes to the figure in
# current matplotlib releases, which leaves the figure blank.
axes3d = fig.add_subplot(projection='3d')
# Surface: x, y and z must all be two-dimensional arrays.
pic = axes3d.plot_surface(xx, yy, z, cmap='rainbow')
# Colour bar for the surface.
plt.colorbar(pic, shrink=0.8)

Excel数据导入数据库

import pandas as pd
from sqlalchemy import create_engine

# SQLAlchemy engine for a local MySQL database using the pymysql driver.
# NOTE(review): user name and password are hard-coded in the URL — consider
# loading them from configuration or the environment instead.
conn = create_engine("mysql+pymysql://hal:123456@localhost:3306/demo")

# Read the CSV; the file has no header row, so column names are given here.
userinfo = pd.read_csv('./user_info_utf.csv',header=None,names=['userid','sex','birth'])

userinfo.info()

# to_sql arguments:
#   name                 name of the target table
#   index=False          do not write the row index
#   if_exists='append'   add rows if the table already exists
# Example MySQL grant for the account used above:
#GRANT ALL PRIVILEGES ON *.* TO 'hal'@'%' IDENTIFIED BY '123456' WITH GRANT OPTION;
# In MySQL a TEXT column can only get a FULLTEXT key, not a regular index.
# First write only the first 50,000 rows; if_exists='fail' raises when the
# table already exists.
userinfo.iloc[:50000].to_sql('user_info',conn,index=False,if_exists='fail')

# NOTE(review): run right after the statement above, this appends ALL rows,
# including the first 50,000 that were just inserted — confirm whether
# userinfo.iloc[50000:] was intended or whether the table is dropped in between.
userinfo.to_sql('user_info',conn,index=False,if_exists='append')

数据分析之--Matplotlib入门

Matplotlib

Seaborn

绘制线性图

图片的标题

点和线的样式

X和Y轴可读的映射

直方图

柱状图

条件性柱状图

饼图

箱图

散布图

散布密度图

回归散布图

线性分类散布图

散布图矩阵

3D图

Excel数据导入数据库

猜你喜欢