# Truncate DataFrame display to 12 rows so the 500000-row frames stay readable.
# The fully-qualified key is used because a short pattern such as 'max_rows'
# is resolved by regex match and raises an error as soon as more than one
# registered option name contains it.
pd.set_option('display.max_rows', 12)
# Build a five-row frame with a single string column 'B'; one value is much
# longer than the others.
df_object = DataFrame({'B' :
Series(['a','foo','bar',
'a really long string','baz'])})
# Same data, but with column 'B' converted to the category dtype.
df_cat = df_object.copy()
df_cat['B'] = df_cat['B'].astype('category')
# Repeat each frame 100000 times (5 rows -> 500000 rows); ignore_index=True
# discards the repeated 0..4 labels in favour of a fresh running index.
df_object = pd.concat([df_object]*100000,ignore_index=True)
df_cat = pd.concat([df_cat]*100000,ignore_index=True)
# Bare expression: in a notebook cell this renders the frame shown below.
df_object
B | |
---|---|
0 | a |
1 | foo |
2 | bar |
3 | a really long string |
4 | baz |
5 | a |
... | ... |
499994 | baz |
499995 | a |
499996 | foo |
499997 | bar |
499998 | a really long string |
499999 | baz |
500000 rows × 1 columns
# Per-column memory footprint in bytes as reported by pandas. The figure below
# works out to 8 bytes per row, so presumably only the object pointers are
# counted, not the Python strings they refer to.
df_object.memory_usage()
B 4000000 dtype: int64
def as_mb(v):
    """Format a byte count as mebibytes with one decimal place.

    v is a size in bytes (any real number); the result is a string such
    as '3.8 MB'.
    """
    # The float literal keeps this a true division on Python 2 as well.
    return "%.1f MB" % (v / (1024.0 * 1024))
# What Python actually reports: sys.getsizeof summed over every element of the
# column. NOTE(review): rows that refer to the same string object are counted
# once per row, so this may overstate the real footprint -- confirm.
import sys
as_mb(sum(map(sys.getsizeof,df_object['B'].values)))
'20.5 MB'
# Approximate fixed-length string storage: cast the object array to a NumPy
# string dtype and take the byte size of the resulting array.
as_mb(df_object['B'].values.astype(str).nbytes)
'9.5 MB'
# Approximate fixed-length string storage, not including the pointers: the
# same array size as above minus the bytes pandas reports for column 'B'.
as_mb(df_object['B'].values.astype(str).nbytes - df_object.memory_usage()['B'])
'5.7 MB'
# Footprint of the categorical column in bytes. The figure below equals one
# byte per row plus the size of the categories index.
df_cat.memory_usage()
B 500040 dtype: int64
# Bytes taken by the categories index itself (five entries, so presumably one
# 8-byte pointer each).
df_cat.B.cat.categories.nbytes
40
# The distinct values of the column; each appears exactly once here.
df_cat.B.cat.categories
Index([u'a', u'a really long string', u'bar', u'baz', u'foo'], dtype='object')
# Per-row integer codes that index into the categories; the output below shows
# they are stored as int8.
df_cat.B.cat.codes
0         0
1         4
2         2
3         1
4         3
5         0
..
499994    3
499995    0
499996    4
499997    2
499998    1
499999    3
dtype: int8
# Write both frames to HDF5 in 'table' format with data_columns=True,
# replacing any existing file (mode='w'), so the on-disk sizes can be compared.
# The key is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.
df_object.to_hdf('data/test_object.h5', key='df', mode='w', data_columns=True, format='table')
df_cat.to_hdf('data/test_cat.h5', key='df', mode='w', data_columns=True, format='table')
!ls -ltr data/*.h5
-rw-rw-r-- 1 jreback staff 84472 Jun 17 22:06 data/test_iterator.h5
-rw-rw-r-- 1 jreback staff 14749644 Jun 18 10:14 data/test_object.h5
-rw-rw-r-- 1 jreback staff 5290697 Jun 18 10:14 data/test_cat.h5
# Open the categorical store and print its summary; the with-statement closes
# the file on exit.
# pd.HDFStore is used directly because pd.get_store was only an alias for it
# and has since been deprecated and removed. print() is called as a function
# so the cell runs on both Python 2 and Python 3.
with pd.HDFStore('data/test_cat.h5') as store:
    print(store)
<class 'pandas.io.pytables.HDFStore'>
File path: data/test_cat.h5
/df frame_table (typ->appendable,nrows->500000,ncols->1,indexers->[index],dc->[B])
/df/meta/B/meta series_table (typ->appendable,nrows->5,ncols->1,indexers->[index],dc->[values])