In [1]:
import pandas as pd
# Load the game logs CSV (path is relative to the notebook's working directory)
# and preview the first rows as the cell output.
gl = pd.read_csv('game_logs.csv')
gl.head()
Out[1]:
In [2]:
# (rows, columns) of the loaded frame.
gl.shape
Out[2]:
In [3]:
# memory_usage='deep' makes pandas inspect object columns to report their real
# memory consumption instead of an estimate.
gl.info(memory_usage='deep')
In [4]:
# Print the average per-column memory footprint, in MB (1024**2 bytes), for
# each listed dtype. The printed label '平均内存占用' means "average memory usage".
for dtype in ['float64', 'int64', 'object']:
    selected_dtype = gl.select_dtypes(include=[dtype])
    # index=False keeps the Index's own footprint out of the result, so the
    # mean is taken over the selected columns only.
    mean_usage_b = selected_dtype.memory_usage(deep=True, index=False).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print('平均内存占用', dtype, mean_usage_mb)
In [5]:
import numpy as np

# Print the machine limits (min/max) of each integer subtype listed below,
# as reported by numpy.
int_types = ['uint8', 'int8', 'int16', 'int32', 'int64']
for it in int_types:
    print(np.iinfo(it))
In [6]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a formatted string.

    Parameters
    ----------
    pandas_obj : pd.DataFrame or other pandas object
        For a DataFrame, the per-column usages returned by
        ``memory_usage(deep=True)`` are summed. Any other object is expected
        to return a single byte count from ``memory_usage(deep=True)``.

    Returns
    -------
    str
        The size with two decimals followed by ``' MB'``, e.g. ``'0.00 MB'``.
        The value is bytes divided by 1024**2.
    """
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame.memory_usage returns one entry per column, so total them.
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2
    return '{:03.2f} MB'.format(usage_mb)
# Downcast every int64 column with pd.to_numeric(downcast='unsigned') and
# print the footprint before and after.
gl_int = gl.select_dtypes(include=['int64'])
coverted_int = gl_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
for frame in (gl_int, coverted_int):
    print(mem_usage(frame))
In [7]:
# Downcast every float64 column with pd.to_numeric(downcast='float') and
# print the footprint before and after.
gl_float = gl.select_dtypes(include=['float64'])
converted_float = gl_float.apply(lambda column: pd.to_numeric(column, downcast='float'))
for frame in (gl_float, converted_float):
    print(mem_usage(frame))
In [8]:
# Work on a copy so the original frame stays available for comparison, then
# overwrite the numeric columns with their downcast versions.
optimized_gl = gl.copy()
for converted in (coverted_int, converted_float):
    optimized_gl[converted.columns] = converted

# Footprint of the original frame followed by the optimized copy.
for frame in (gl, optimized_gl):
    print(mem_usage(frame))
In [9]:
# Take an independent copy of the object-dtype columns and summarize them.
gl_obj = gl.select_dtypes(include = ['object']).copy()
gl_obj.describe()
Out[9]:
In [10]:
# Pull out the day_of_week column and preview it.
dow = gl_obj['day_of_week']
dow.head()
Out[10]:
In [11]:
# Convert the column to the 'category' dtype and preview the result.
dow_cat = dow.astype('category')
dow_cat.head()
Out[11]:
In [13]:
# Integer codes that back the first ten categorical values.
dow_cat.head(10).cat.codes
Out[13]:
In [14]:
# Footprint of the column as object dtype, then as category dtype.
for series in (dow, dow_cat):
    print(mem_usage(series))
In [15]:
# Convert object columns to 'category' when fewer than half of their values
# are distinct; all other columns are carried over unchanged.
num_total_values = len(gl_obj)
category_cols = []
for col in gl_obj.columns:
    # dropna=False counts NaN as a distinct value, like len(Series.unique()).
    num_unique_values = gl_obj[col].nunique(dropna=False)
    # Checking num_total_values first avoids ZeroDivisionError on an empty frame.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        category_cols.append(col)

# Build the result with one astype call instead of growing an empty frame
# column by column through .loc; column order and index come from gl_obj.
converted_obj = gl_obj.astype({col: 'category' for col in category_cols})
In [16]:
# Footprint of the object columns before and after the category conversion.
for frame in (gl_obj, converted_obj):
    print(mem_usage(frame))
In [19]:
# Grab the date column from the optimized frame and show its first five values.
date = optimized_gl['date']
date.iloc[:5]
Out[19]:
In [20]:
# Footprint of the date column before it is parsed as datetime.
print (mem_usage(date))
In [21]:
# Parse the date column using an explicit YYYYMMDD format, store the result
# back on the optimized frame, and print the column's new footprint.
optimized_gl['date'] = pd.to_datetime(date,format='%Y%m%d')
print (mem_usage(optimized_gl['date']))
In [22]:
# First five values of the converted date column.
optimized_gl['date'][:5]
Out[22]:
In [ ]: