import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the flights dataset and preview its first five rows.
flights = pd.read_csv(filepath_or_buffer='data/flights.csv')
flights.head(n=5)
|
MONTH |
DAY |
WEEKDAY |
AIRLINE |
ORG_AIR |
DEST_AIR |
SCHED_DEP |
DEP_DELAY |
AIR_TIME |
DIST |
SCHED_ARR |
ARR_DELAY |
DIVERTED |
CANCELLED |
0 |
1 |
1 |
4 |
WN |
LAX |
SLC |
1625 |
58.0 |
94.0 |
590 |
1905 |
65.0 |
0 |
0 |
1 |
1 |
1 |
4 |
UA |
DEN |
IAD |
823 |
7.0 |
154.0 |
1452 |
1333 |
-13.0 |
0 |
0 |
2 |
1 |
1 |
4 |
MQ |
DFW |
VPS |
1305 |
36.0 |
85.0 |
641 |
1453 |
35.0 |
0 |
0 |
3 |
1 |
1 |
4 |
AA |
DFW |
DCA |
1555 |
7.0 |
126.0 |
1192 |
1935 |
-7.0 |
0 |
0 |
4 |
1 |
1 |
4 |
WN |
LAX |
MCI |
1720 |
48.0 |
166.0 |
1363 |
2225 |
39.0 |
0 |
0 |
# Add two indicator columns marking each flight as delayed or on time.
# A flight counts as delayed when it arrives 15 or more minutes late.
flights['DELAYED'] = (flights['ARR_DELAY'] >= 15).astype(int)
cols = ['DIVERTED', 'CANCELLED', 'DELAYED']
cols
# A flight is on time when none of the diverted / cancelled / delayed
# indicators is set for its row.
flights['ON_TIME'] = 1 - flights[cols].any(axis='columns')
cols += ['ON_TIME']
# cols is now ['DIVERTED', 'CANCELLED', 'DELAYED', 'ON_TIME']
# Summing each indicator column gives the number of flights per status.
status = flights[cols].sum()
status
'''
DIVERTED 137
CANCELLED 881
DELAYED 11685
ON_TIME 45789
dtype: int64
'''
# Plot univariate summaries of the categorical and continuous columns on a
# grid of 2 rows by 3 columns of axes.
fig, ax_array = plt.subplots(nrows=2, ncols=3, figsize=(18, 8))
ax1, ax2, ax3, ax4, ax5, ax6 = ax_array.flat  # row-major: top row first
fig.suptitle('2015 US Flights - Univariate Summary', size=20)

# Top row: frequency counts of the categorical columns.
ac = flights['AIRLINE'].value_counts()
ac.plot.barh(ax=ax1, title='Airline')  # horizontal bar chart
oc = flights['ORG_AIR'].value_counts()
oc.plot.bar(ax=ax2, rot=0, title='Origin City')  # vertical bar chart
dc = flights['DEST_AIR'].value_counts().head(10)  # first ten counts only
dc.plot.bar(ax=ax3, rot=0, title='Destination City')

# Bottom row: status counts (log scale) and the continuous distributions.
status.plot.bar(ax=ax4, rot=0, log=True, title='Flight Status')
flights['DIST'].plot.kde(ax=ax5, xlim=(0, 3000), title='Distance KDE')
flights['ARR_DELAY'].plot.hist(ax=ax6, title='Arrival Delay',  # histogram
                               range=(0, 200))
# Assemble the date parts: a constant year column plus the hour and minute
# split out of the scheduled departure time (hundreds = hour, remainder =
# minute, e.g. 1625 -> 16 and 25).
hour = flights['SCHED_DEP'].floordiv(100)
minute = flights['SCHED_DEP'].mod(100)
df_date = flights[['MONTH', 'DAY']].assign(
    YEAR=2015,
    HOUR=hour,
    MINUTE=minute,
)
df_date.head(n=5)
|
MONTH |
DAY |
YEAR |
HOUR |
MINUTE |
0 |
1 |
1 |
2015 |
16 |
25 |
1 |
1 |
1 |
2015 |
8 |
23 |
2 |
1 |
1 |
2015 |
13 |
5 |
3 |
1 |
1 |
2015 |
15 |
55 |
4 |
1 |
1 |
2015 |
17 |
20 |
# Combine the date-part columns of df_date into a single Series of
# Timestamps with pd.to_datetime.
flight_dep = pd.to_datetime(arg=df_date)
flight_dep.head(n=5)
'''
0 2015-01-01 16:25:00
1 2015-01-01 08:23:00
2 2015-01-01 13:05:00
3 2015-01-01 15:55:00
4 2015-01-01 17:20:00
dtype: datetime64[ns]
'''
# Use the departure timestamps as the row index, then count flights per week.
flights.index = flight_dep
# resample regroups the time-indexed rows into regular frequency bins
# ('W' = weekly); size() returns the number of rows in each bin.
fc = flights.resample(rule='W').size()
fc.plot.line(title='Flights per Week', grid=True, figsize=(12, 3))
# Treat weeks whose flight count is not above 1000 as missing (NaN), then fill
# the gaps with interpolate().
fc_miss = fc.where(fc > 1000)
# limit_direction='both' lets the fill run both forwards and backwards from
# the known values.
fc_intp = fc_miss.interpolate(limit_direction='both')
ax = fc_intp.plot(color='black', figsize=(16, 4))
# Overlay a thick light-grey line on the weeks whose raw count was below 500.
fc_intp[fc < 500].plot(linewidth=10, grid=True,
                       color='.8', ax=ax)
# The label is passed positionally: its keyword was renamed from ``s`` to
# ``text`` in Matplotlib 3.3 and the old name was later removed, so
# ``s='missing data'`` raises TypeError on current Matplotlib releases.
ax.annotate('missing data', xy=(.8, .55), xytext=(.8, .77),
            xycoords='axes fraction',
            ha='center', size=20, arrowprops=dict())
ax.set_title('Flights per Week (Interpolated Missing Data)')
Text(0.5, 1.0, 'Flights per Week (Interpolated Missing Data)')
# Plot the 10 destination airports with the longest average inbound flight
# distance, considering only destinations with more than 100 flights.
(
    flights.groupby('DEST_AIR')['DIST']
    .agg(['mean', 'count'])
    .loc[lambda stats: stats['count'] > 100]
    .sort_values(by='mean')
    .tail(10)  # ascending sort, so the last ten rows have the largest means
    .plot.bar(y='mean', legend=False, rot=0, figsize=(14, 4),
              title='Average Distance per Destination')
)
# Scatter plot of air time against distance for flights with DIST <= 2000.
# The index is reset to plain integers first; rows with missing values are
# dropped after filtering.
fs = (
    flights.reset_index(drop=True)[['DIST', 'AIR_TIME']]
    .query('DIST <= 2000')
    .dropna()
)
fs.plot.scatter(x='DIST', y='AIR_TIME', s=1, figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x260735471c0>
# Bin the flight distances with pd.cut into eight groups, each 250 wide,
# using the bin edges 0, 250, ..., 2000; then count the flights per group.
fs['DIST_GROUP'] = pd.cut(x=fs['DIST'], bins=range(0, 2001, 250))
fs['DIST_GROUP'].value_counts().sort_index()
'''
(0, 250] 6529
(250, 500] 12631
(500, 750] 11506
(750, 1000] 8832
(1000, 1250] 5071
(1250, 1500] 3198
(1500, 1750] 3885
(1750, 2000] 1815
Name: DIST_GROUP, dtype: int64
'''
# Standardise air time within each distance group: every value becomes its
# z-score, i.e. the number of standard deviations from the group's mean.
def normalize(x):
    """Return *x* centred on its mean and divided by its standard deviation."""
    return (x - x.mean()) / x.std()


fs['TIME_SCORE'] = (
    fs.groupby('DIST_GROUP')['AIR_TIME'].transform(normalize)
)
fs.head(n=5)
|
DIST |
AIR_TIME |
DIST_GROUP |
TIME_SCORE |
0 |
590 |
94.0 |
(500, 750] |
0.490966 |
1 |
1452 |
154.0 |
(1250, 1500] |
-1.267551 |
2 |
641 |
85.0 |
(500, 750] |
-0.296749 |
3 |
1192 |
126.0 |
(1000, 1250] |
-1.211020 |
4 |
1363 |
166.0 |
(1250, 1500] |
-0.521999 |
# Draw one box plot of the z-scores per distance group to expose outliers.
ax = fs.boxplot(column='TIME_SCORE', by='DIST_GROUP', figsize=(16, 4))
ax.set_title(label='Z-Scores for Distance Groups')
# Blank out the figure-level title so only the axes title remains.
ax.figure.suptitle(t='')
# Collect the flights whose air time lies more than 6 standard deviations
# above their distance group's mean into a DataFrame of outliers.
# fs was built from flights.reset_index(drop=True), so its surviving index
# labels are the row positions of those flights in `flights`; that is why
# they can be fed to iloc.
outliers = flights.iloc[fs[fs['TIME_SCORE'] > 6].index]
# Take an explicit copy so that adding PLOT_NUM below writes to an
# independent frame instead of a derived slice of `flights` (avoids pandas'
# SettingWithCopyWarning).
outliers = outliers[['AIRLINE', 'ORG_AIR', 'DEST_AIR', 'AIR_TIME',
                     'DIST', 'ARR_DELAY', 'DIVERTED']].copy()
# Number the outliers 1..k so they can be cross-referenced on a plot.
outliers['PLOT_NUM'] = range(1, len(outliers) + 1)
outliers
|
AIRLINE |
ORG_AIR |
DEST_AIR |
AIR_TIME |
DIST |
ARR_DELAY |
DIVERTED |
PLOT_NUM |
2015-04-08 09:40:00 |
DL |
ATL |
CVG |
121.0 |
373 |
54.0 |
0 |
1 |
2015-05-25 16:30:00 |
F9 |
MSP |
ATL |
199.0 |
907 |
79.0 |
0 |
2 |
2015-09-10 20:00:00 |
UA |
IAH |
MCI |
176.0 |
643 |
76.0 |
0 |
3 |
2015-12-10 19:53:00 |
OO |
PHX |
SFO |
164.0 |
651 |
146.0 |
0 |
4 |
2015-12-26 09:15:00 |
NK |
ORD |
DFW |
210.0 |
802 |
98.0 |
0 |
5 |
# Redraw the scatter plot with the outliers highlighted and numbered. The
# pandas `table` option attaches the outliers DataFrame as a table under the
# axes so each numbered point can be looked up.
ax = fs.plot.scatter(x='DIST', y='AIR_TIME', s=1,
                     figsize=(16, 4), table=outliers)
outliers.plot.scatter(x='DIST', y='AIR_TIME', s=25, ax=ax, grid=True)
# Label every outlier with its PLOT_NUM, offset slightly from the marker.
outs = outliers[['AIR_TIME', 'DIST', 'PLOT_NUM']]
for t, d, n in zip(outs['AIR_TIME'], outs['DIST'], outs['PLOT_NUM']):
    ax.text(d + 5, t + 5, str(n))
# Move the x tick labels, hide the x tick marks and clear the x label
# (presumably to keep them from colliding with the attached table).
for tick_label in ax.get_xticklabels():
    tick_label.set_y(.1)
for tick_line in ax.get_xticklines():
    tick_line.set_visible(False)
ax.set_xlabel('')
ax.set_title('Flight Time vs Distance with Outliers')