Python对数据实现可视化

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
# 读取flights数据集
flights = pd.read_csv('data/flights.csv')
flights.head()

	MONTH	DAY	WEEKDAY	AIRLINE	ORG_AIR	DEST_AIR	SCHED_DEP	DEP_DELAY	AIR_TIME	DIST	SCHED_ARR	ARR_DELAY
0	1	1	4	WN	LAX	SLC	1625	58.0	94.0	590	1905	65.0
1	1	1	4	UA	DEN	IAD	823	7.0	154.0	1452	1333	-13.0
2	1	1	4	MQ	DFW	VPS	1305	36.0	85.0	641	1453	35.0
3	1	1	4	AA	DFW	DCA	1555	7.0	126.0	1192	1935	-7.0
4	1	1	4	WN	LAX	MCI	1720	48.0	166.0	1363	2225	39.0

# 创建两列，表示延迟和准时
flights['DELAYED'] = flights['ARR_DELAY'].ge(15).astype(int) #大于等于15分钟的是延迟
cols = ['DIVERTED', 'CANCELLED', 'DELAYED']
cols
flights['ON_TIME'] = 1 - flights[cols].any(axis=1) #准时的等于总共的减去取消的，延迟的，转机的
cols.append('ON_TIME')
#clos=['DIVERTED', 'CANCELLED', 'DELAYED', 'ON_TIME']
status = flights[cols].sum()
status
'''
DIVERTED       137
CANCELLED      881
DELAYED      11685
ON_TIME      45789
dtype: int64
'''

# 对类型值和连续值列作图
fig, ax_array = plt.subplots(2, 3, figsize=(18,8)) #rows=2,cols=3
(ax1, ax2, ax3), (ax4, ax5, ax6) = ax_array
fig.suptitle('2015 US Flights - Univariate Summary', size=20)


ac = flights['AIRLINE'].value_counts()
ac.plot(kind='barh', ax=ax1, title ='Airline') #横向条形图

oc = flights['ORG_AIR'].value_counts()
oc.plot(kind='bar', ax=ax2, rot=0, title='Origin City') # Bar：柱状图/条形图

dc = flights['DEST_AIR'].value_counts().head(10)
dc.plot(kind='bar', ax=ax3, rot=0, title='Destination City') # Bar：柱状图/条形图

status.plot(kind='bar', ax=ax4, rot=0, log=True, title='Flight Status')

flights['DIST'].plot(kind='kde', ax=ax5, xlim=(0, 3000),
                  title='Distance KDE')

flights['ARR_DELAY'].plot(kind='hist', ax=ax6,  #直方图
                       title='Arrival Delay', range=(0,200))

# 添加关于年的列，用起飞时间得到小时和分钟
hour = flights['SCHED_DEP'] // 100
minute = flights['SCHED_DEP'] % 100
df_date = flights[['MONTH', 'DAY']].assign(YEAR=2015, HOUR=hour, MINUTE=minute)
df_date.head()

	MONTH	DAY	YEAR	HOUR	MINUTE
0	1	1	2015	16	25
1	1	1	2015	8	23
2	1	1	2015	13	5
3	1	1	2015	15	55
4	1	1	2015	17	20

# 用to_datetime函数，将df_date变为Timestamps对象
flight_dep = pd.to_datetime(df_date)
flight_dep.head()
'''
0   2015-01-01 16:25:00
1   2015-01-01 08:23:00
2   2015-01-01 13:05:00
3   2015-01-01 15:55:00
4   2015-01-01 17:20:00
dtype: datetime64[ns]

'''

# 用flight_dep作为新的行索引，并根据它统计每周的航班数
flights.index = flight_dep
fc = flights.resample('W').size() # Pandas中的resample，重新采样，是对原样本重新处理的一个方法，
#是一个对常规时间序列数据重新采样和频率转换的便捷的方法。
fc.plot(figsize=(12,3), title='Flights per Week', grid=True)

# 如果航班数小于1000，则将其当做缺失值。然后用interpolate方法填补缺失值
fc_miss = fc.where(fc >  1000)
fc_intp = fc_miss.interpolate(limit_direction='both')

ax = fc_intp.plot(color='black', figsize=(16,4))
fc_intp[fc < 500].plot(linewidth=10, grid=True, 
                    color='.8', ax=ax)

ax.annotate(xy=(.8, .55), xytext=(.8, .77), 
         xycoords='axes fraction', s='missing data', 
         ha='center',  size=20, arrowprops=dict())
ax.set_title('Flights per Week (Interpolated Missing Data)')

Text(0.5, 1.0, 'Flights per Week (Interpolated Missing Data)')

# 找到10个有最长平均入境航班航程、最少100航次的机场
flights.groupby('DEST_AIR')['DIST'] \
                .agg(['mean', 'count']) \
                .query('count > 100') \
                .sort_values('mean') \
                .tail(10) \
                .plot(kind='bar', y='mean', legend=False, 
                      rot=0, figsize=(14,4),
                      title='Average Distance per Destination')

# 画出航班时间和航程的散点图
fs = flights.reset_index(drop=True)[['DIST', 'AIR_TIME']].query('DIST <= 2000').dropna()
fs.plot(x='DIST', y='AIR_TIME', kind='scatter', s=1, figsize=(16,4))

<matplotlib.axes._subplots.AxesSubplot at 0x260735471c0>

# 用cut函数，将航班距离分成八组
fs['DIST_GROUP'] = pd.cut(fs['DIST'], bins=range(0, 2001, 250))
fs['DIST_GROUP'].value_counts().sort_index()
'''
(0, 250]         6529
(250, 500]      12631
(500, 750]      11506
(750, 1000]      8832
(1000, 1250]     5071
(1250, 1500]     3198
(1500, 1750]     3885
(1750, 2000]     1815
Name: DIST_GROUP, dtype: int64

'''

# 计算每组的标准差
normalize = lambda x: (x - x.mean()) / x.std()
fs['TIME_SCORE'] = fs.groupby('DIST_GROUP')['AIR_TIME'] \
                              .transform(normalize)
fs.head()

	DIST	AIR_TIME	DIST_GROUP	TIME_SCORE
0	590	94.0	(500, 750]	0.490966
1	1452	154.0	(1250, 1500]	-1.267551
2	641	85.0	(500, 750]	-0.296749
3	1192	126.0	(1000, 1250]	-1.211020
4	1363	166.0	(1250, 1500]	-0.521999

# 用boxplot方法画出异常值
ax = fs.boxplot(by='DIST_GROUP', column='TIME_SCORE', figsize=(16,4))
ax.set_title('Z-Scores for Distance Groups')
ax.figure.suptitle('')

# 检查超出6个标准偏差的点。用一个DataFrame记录异常点。
outliers = flights.iloc[fs[fs['TIME_SCORE'] > 6].index]
outliers = outliers[['AIRLINE','ORG_AIR', 'DEST_AIR', 'AIR_TIME', 
                  'DIST', 'ARR_DELAY', 'DIVERTED']]
outliers['PLOT_NUM'] = range(1, len(outliers) + 1)
outliers

	AIRLINE	ORG_AIR	DEST_AIR	AIR_TIME	DIST	ARR_DELAY	PLOT_NUM
2015-04-08 09:40:00	DL	ATL	CVG	121.0	373	54.0	1
2015-05-25 16:30:00	F9	MSP	ATL	199.0	907	79.0	2
2015-09-10 20:00:00	UA	IAH	MCI	176.0	643	76.0	3
2015-12-10 19:53:00	OO	PHX	SFO	164.0	651	146.0	4
2015-12-26 09:15:00	NK	ORD	DFW	210.0	802	98.0	5

# 可以这张表的数据确定异常值。pandas提供了将表格附加于图片底部的方法。
ax = fs.plot(x='DIST', y='AIR_TIME', 
                      kind='scatter', s=1, 
                      figsize=(16,4), table=outliers)

outliers.plot(x='DIST', y='AIR_TIME',
                       kind='scatter', s=25, ax=ax, grid=True)


outs = outliers[['AIR_TIME', 'DIST', 'PLOT_NUM']]
for t, d, n  in outs.itertuples(index=False):
    ax.text(d + 5, t + 5, str(n))
    
plt.setp(ax.get_xticklabels(), y=.1)
plt.setp(ax.get_xticklines(), visible=False)
ax.set_xlabel('')
ax.set_title('Flight Time vs Distance with Outliers')

猜你喜欢

目录

热门文章