智能切分时间序列

# Load the Denver crimes dataset from the HDF5 file crime.h5, then show the
# dtype of every column and the first few rows.
crime = pd.read_hdf('data/crime.h5', key='crime')
crime.dtypes
'''
OFFENSE_TYPE_ID              category
OFFENSE_CATEGORY_ID          category
REPORTED_DATE          datetime64[ns]
GEO_LON                       float64
GEO_LAT                       float64
NEIGHBORHOOD_ID              category
IS_CRIME                        int64
IS_TRAFFIC                      int64
dtype: object
'''
# Make the report timestamp the row index so rows can be selected by date.
crime = crime.set_index(keys='REPORTED_DATE')
crime.head(n=5)
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2014-06-29 02:01:00 traffic-accident-dui-duid traffic-accident -105.000149 39.745753 cbd 0 1
2014-06-29 01:54:00 vehicular-eluding-no-chase all-other-crimes -104.884660 39.738702 east-colfax 1 0
2014-06-29 02:00:00 disturbing-the-peace public-disorder -105.020719 39.706674 athmar-park 1 0
2014-06-29 02:18:00 curfew public-disorder -105.001552 39.769505 sunnyside 1 0
2014-06-29 04:17:00 aggravated-assault aggravated-assault -105.018557 39.679229 college-view-south-platte 1 0
# Note the three category columns and the one datetime64 (Timestamp) column:
# the dtypes listed above came straight from the HDF5 file on load.
# This is very different from a csv file, which stores nothing but strings.
# Because REPORTED_DATE is now the row index, rows can be selected with
# date strings.
pd.options.display.max_rows = 4
crime.loc['2016-05-12 16:45:00']
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2016-05-12 16:45:00 traffic-accident traffic-accident -104.847024 39.779596 montbello 0 1
2016-05-12 16:45:00 traffic-accident traffic-accident -105.049180 39.769296 west-highland 0 1
2016-05-12 16:45:00 fraud-identity-theft white-collar-crime -104.931971 39.717359 hilltop 1 0
# A partial date string also works: this selects every row reported on that day.
crime.loc['2016-05-12']
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2016-05-12 23:51:00 criminal-mischief-other public-disorder -105.017241 39.705845 athmar-park 1 0
2016-05-12 18:40:00 liquor-possession drug-alcohol -104.995692 39.747875 cbd 1 0
... ... ... ... ... ... ... ...
2016-05-12 15:59:00 menacing-felony-w-weap aggravated-assault -104.935172 39.723703 hilltop 1 0
2016-05-12 16:39:00 assault-dv other-crimes-against-persons -104.974700 39.740555 north-capitol-hill 1 0

243 rows × 7 columns

# A whole month, a whole year, or a single hour of a day can be selected the
# same way.
crime.loc['2016-05'].shape
#(8012, 7)
crime.loc['2016'].shape
#(91076, 7)
crime.loc['2016-05-12 03'].shape
#(4, 7)
# The date string may also contain the name of the month.
crime.loc['Dec 2015'].sort_index()
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2015-12-01 00:48:00 drug-cocaine-possess drug-alcohol -104.891681 39.740155 east-colfax 1 0
2015-12-01 00:48:00 theft-of-motor-vehicle auto-theft -104.891681 39.740155 east-colfax 1 0
... ... ... ... ... ... ... ...
2015-12-31 23:45:00 violation-of-restraining-order all-other-crimes -105.034887 39.741827 west-colfax 1 0
2015-12-31 23:50:00 weapon-poss-illegal-dangerous all-other-crimes -105.032769 39.709188 westwood 1 0

6907 rows × 7 columns

# Several other date string formats are accepted as well.
crime.loc['2016 Sep, 15'].shape
#(252, 7)
crime.loc['21st October 2014 05'].shape
#(4, 7)
# Date strings can also be used as slice bounds.
# The index is sorted *before* slicing rather than after: REPORTED_DATE is not
# in chronological order in this dataset, and pandas >= 2.0 raises KeyError
# when a non-monotonic DatetimeIndex is sliced with bounds that are not exact
# index values. Slicing a sorted index returns the same rows, already ordered.
crime.sort_index().loc['2015-3-4':'2016-1-1']
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2015-03-04 00:11:00 assault-dv other-crimes-against-persons -105.021966 39.770883 sunnyside 1 0
2015-03-04 00:19:00 assault-dv other-crimes-against-persons -104.978988 39.748799 five-points 1 0
... ... ... ... ... ... ... ...
2016-01-01 23:45:00 drug-cocaine-possess drug-alcohol -104.987310 39.753598 five-points 1 0
2016-01-01 23:48:00 drug-poss-paraphernalia drug-alcohol -104.986020 39.752541 five-points 1 0

75403 rows × 7 columns

# The slice bounds can be more precise (down to the hour, minute or second).
# As with any date-range slice on this data, sort the index first: the raw
# index is not chronological, and pandas >= 2.0 raises KeyError when a
# non-monotonic DatetimeIndex is sliced with bounds that are not exact index
# values.
crime.sort_index().loc['2015-3-4 22':'2016-1-1 23:45:00']
  OFFENSE_TYPE_ID OFFENSE_CATEGORY_ID GEO_LON GEO_LAT NEIGHBORHOOD_ID IS_CRIME IS_TRAFFIC
REPORTED_DATE              
2015-03-04 22:25:00 traffic-accident-hit-and-run traffic-accident -104.973896 39.769064 five-points 0 1
2015-03-04 22:30:00 traffic-accident traffic-accident -104.906412 39.632816 hampden-south 0 1
... ... ... ... ... ... ... ...
2016-01-01 23:40:00 robbery-business robbery -105.039236 39.726157 villa-park 1 0
2016-01-01 23:45:00 drug-cocaine-possess drug-alcohol -104.987310 39.753598 five-points 1 0

75175 rows × 7 columns

原理

# An HDF5 file keeps the dtype of every column, which can cut memory use a lot.
# Here three columns are stored with the category dtype instead of object;
# converted to object, the frame needs roughly four times as much memory.
mem_cat = crime.memory_usage().sum()
mem_obj = (
    crime
    .astype(dict.fromkeys(
        ['OFFENSE_TYPE_ID', 'OFFENSE_CATEGORY_ID', 'NEIGHBORHOOD_ID'],
        'object',
    ))
    .memory_usage(deep=True)  # deep=True also counts the Python string objects
    .sum()
)

# 1 KB = 2**10 bytes, 1 MB = 2**20 bytes, 1 GB = 2**30 bytes
mb = 1 << 20
round(mem_cat / mb, 1), round(mem_obj / mb, 1)
#(29.4, 122.7)
# Smart date selection and slicing require the row index to hold dates.
# Setting REPORTED_DATE as the row index earlier made the index a DatetimeIndex.
crime.index[:2]
#DatetimeIndex(['2014-06-29 02:01:00', '2014-06-29 01:54:00'], dtype='datetime64[ns]', name='REPORTED_DATE', freq=None)

如果索引是datetime64类型,先对行索引排序,再做切片会更快

# With a datetime64 index, sorting the index first makes slicing faster:
# compare the timing on the original index with the timing on the sorted copy.
# NOTE(review): newer pandas releases (2.0+) are reported to raise KeyError for
# this kind of slice on an unsorted DatetimeIndex — confirm against the
# installed version before re-running the first %timeit.
%timeit crime.loc['2015-3-4':'2016-1-1']
#27.8 ms ± 5.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
crime_sort = crime.sort_index()
%timeit crime_sort.loc['2015-3-4':'2016-1-1']
#4.02 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
pd.options.display.max_rows = 6 # number of rows shown for one notebook cell

猜你喜欢

转载自blog.csdn.net/weixin_48135624/article/details/114213215