# Load the Denver "crimes" dataset from the HDF5 file crime.h5, then show the
# dtype of every column and the first few rows.
crime = pd.read_hdf('data/crime.h5', key='crime')
crime.dtypes
'''
OFFENSE_TYPE_ID category
OFFENSE_CATEGORY_ID category
REPORTED_DATE datetime64[ns]
GEO_LON float64
GEO_LAT float64
NEIGHBORHOOD_ID category
IS_CRIME int64
IS_TRAFFIC int64
dtype: object
'''
# Use REPORTED_DATE as the row index so rows can later be selected by date.
crime = crime.set_index(keys='REPORTED_DATE')
crime.head()
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2014-06-29 02:01:00 | traffic-accident-dui-duid | traffic-accident | -105.000149 | 39.745753 | cbd | 0 | 1 |
| 2014-06-29 01:54:00 | vehicular-eluding-no-chase | all-other-crimes | -104.884660 | 39.738702 | east-colfax | 1 | 0 |
| 2014-06-29 02:00:00 | disturbing-the-peace | public-disorder | -105.020719 | 39.706674 | athmar-park | 1 | 0 |
| 2014-06-29 02:18:00 | curfew | public-disorder | -105.001552 | 39.769505 | sunnyside | 1 | 0 |
| 2014-06-29 04:17:00 | aggravated-assault | aggravated-assault | -105.018557 | 39.679229 | college-view-south-platte | 1 | 0 |
# Three columns are categorical and one holds Timestamp objects; these dtypes
# were fixed when the data was created.
# A CSV file is very different: it stores nothing but strings.
# REPORTED_DATE was made the row index above, so rows can be selected
# directly with datetime strings.
pd.set_option('display.max_rows', 4)
crime.loc['2016-05-12 16:45:00']
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2016-05-12 16:45:00 | traffic-accident | traffic-accident | -104.847024 | 39.779596 | montbello | 0 | 1 |
| 2016-05-12 16:45:00 | traffic-accident | traffic-accident | -105.049180 | 39.769296 | west-highland | 0 | 1 |
| 2016-05-12 16:45:00 | fraud-identity-theft | white-collar-crime | -104.931971 | 39.717359 | hilltop | 1 | 0 |
# Partial datetime strings also match: a date alone selects the rows reported on that day
crime.loc['2016-05-12']
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2016-05-12 23:51:00 | criminal-mischief-other | public-disorder | -105.017241 | 39.705845 | athmar-park | 1 | 0 |
| 2016-05-12 18:40:00 | liquor-possession | drug-alcohol | -104.995692 | 39.747875 | cbd | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-05-12 15:59:00 | menacing-felony-w-weap | aggravated-assault | -104.935172 | 39.723703 | hilltop | 1 | 0 |
| 2016-05-12 16:39:00 | assault-dv | other-crimes-against-persons | -104.974700 | 39.740555 | north-capitol-hill | 1 | 0 |

243 rows × 7 columns
# A whole month, a whole year, or a single hour of a given day can be selected the same way
crime.loc['2016-05'].shape
#(8012, 7)
crime.loc['2016'].shape
#(91076, 7)
crime.loc['2016-05-12 03'].shape
#(4, 7)
# The string may also contain the name of the month
crime.loc['Dec 2015'].sort_index()
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2015-12-01 00:48:00 | drug-cocaine-possess | drug-alcohol | -104.891681 | 39.740155 | east-colfax | 1 | 0 |
| 2015-12-01 00:48:00 | theft-of-motor-vehicle | auto-theft | -104.891681 | 39.740155 | east-colfax | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2015-12-31 23:45:00 | violation-of-restraining-order | all-other-crimes | -105.034887 | 39.741827 | west-colfax | 1 | 0 |
| 2015-12-31 23:50:00 | weapon-poss-illegal-dangerous | all-other-crimes | -105.032769 | 39.709188 | westwood | 1 | 0 |

6907 rows × 7 columns
# Several other date-string formats work as well
crime.loc['2016 Sep, 15'].shape
#(252, 7)
crime.loc['21st October 2014 05'].shape
#(4, 7)
# Date strings can also be used as slice bounds (both ends are inclusive).
# The index is sorted BEFORE slicing: on pandas >= 2.0, slicing an unsorted
# (non-monotonic) DatetimeIndex with string bounds that are not exact index
# entries raises KeyError. Sorting first selects the same rows and still
# returns them in chronological order.
crime.sort_index().loc['2015-3-4':'2016-1-1']
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2015-03-04 00:11:00 | assault-dv | other-crimes-against-persons | -105.021966 | 39.770883 | sunnyside | 1 | 0 |
| 2015-03-04 00:19:00 | assault-dv | other-crimes-against-persons | -104.978988 | 39.748799 | five-points | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-01-01 23:45:00 | drug-cocaine-possess | drug-alcohol | -104.987310 | 39.753598 | five-points | 1 | 0 |
| 2016-01-01 23:48:00 | drug-poss-paraphernalia | drug-alcohol | -104.986020 | 39.752541 | five-points | 1 | 0 |

75403 rows × 7 columns
# The slice bounds may be given with finer time resolution (hour, minute, second).
# The index is sorted BEFORE slicing: on pandas >= 2.0, slicing an unsorted
# (non-monotonic) DatetimeIndex with string bounds that are not exact index
# entries raises KeyError. Sorting first selects the same rows and still
# returns them in chronological order.
crime.sort_index().loc['2015-3-4 22':'2016-1-1 23:45:00']
| REPORTED_DATE | OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC |
|---|---|---|---|---|---|---|---|
| 2015-03-04 22:25:00 | traffic-accident-hit-and-run | traffic-accident | -104.973896 | 39.769064 | five-points | 0 | 1 |
| 2015-03-04 22:30:00 | traffic-accident | traffic-accident | -104.906412 | 39.632816 | hampden-south | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-01-01 23:40:00 | robbery-business | robbery | -105.039236 | 39.726157 | villa-park | 1 | 0 |
| 2016-01-01 23:45:00 | drug-cocaine-possess | drug-alcohol | -104.987310 | 39.753598 | five-points | 1 | 0 |

75175 rows × 7 columns
## 原理
# An HDF5 file keeps the dtype of every column, which can greatly reduce memory use.
# In the example above three columns are stored as category rather than object;
# stored as object they take about four times as much memory (see the figures below).
# Unit reminder: 1 KB = 1024 bytes = 2**10 bytes, 1 MB = 1024 KB = 2**20 bytes,
# 1 GB = 1024 MB = 2**30 bytes.
mb = 2 ** 20
mem_cat = crime.memory_usage().sum()
# Cast the three categorical columns back to object and measure again;
# deep=True also counts the Python string objects themselves.
mem_obj = crime.astype(
    dict.fromkeys(
        ['OFFENSE_TYPE_ID', 'OFFENSE_CATEGORY_ID', 'NEIGHBORHOOD_ID'],
        'object',
    )
).memory_usage(deep=True).sum()
tuple(round(total / mb, 1) for total in (mem_cat, mem_obj))
#(29.4, 122.7)
# Selecting and slicing rows by date strings requires the row index to hold dates.
# Setting REPORTED_DATE as the row index earlier turned the index into a DatetimeIndex.
crime.index[:2]
#DatetimeIndex(['2014-06-29 02:01:00', '2014-06-29 01:54:00'], dtype='datetime64[ns]', name='REPORTED_DATE', freq=None)
## 如果索引是datetime64类型,先对行索引排序再切片会更快
# With a datetime64 row index, the index can be sorted before slicing; the two
# timings below compare the same slice on the unsorted and the sorted frame.
# NOTE(review): on pandas >= 2.0 the unsorted slice may raise KeyError when the
# string bounds are not exact index entries - confirm the pandas version in use.
%timeit crime.loc['2015-3-4':'2016-1-1']
#27.8 ms ± 5.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
crime_sort = crime.sort_index()
%timeit crime_sort.loc['2015-3-4':'2016-1-1']
#4.02 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
pd.options.display.max_rows = 6 # set how many rows a notebook cell displays