import pandas as pd import numpy as np import pylab as plt import blz import sys import csv from time import time from shutil import rmtree from collections import defaultdict t1 = time() df = pd.read_hdf('h5/msft-18-SEP-2013-blosc9.h5', '/msft') t2 = time() print "Reading h5: " + str(t2 - t1) dt = np.dtype([(k, v if v.kind!='O' else 'S49') for k,v in df.dtypes.iteritems()]) t1 = time() btm = blz.fromiter((i[1:] for i in df.itertuples()), dtype=dt, count=len(df), bparams=blz.bparams(clevel=0)) t2 = time() print "Converting h5 to btable in memory: " + str(t2 - t1) #If the blz already exist, remove it rmtree('blz/msft-18-SEP-2013-blosc9.blz', ignore_errors=True) t1 = time() btd = btm.copy(rootdir='blz/msft-18-SEP-2013-blosc9.blz', bparams=blz.bparams(clevel=0)) t2 = time() print "Copying btable from memory to disk: " + str(t2-t1) def get_OHLC_points(src): windows = [] window_size = 79 sample_size = 2455 iter = src.where('(Type == \'Trade\')') for i in xrange(sample_size): local_max = - np.inf local_min = np.inf average_price = 0 for j in xrange(window_size): current = iter.next() price = current[7] average_price += price if(price > local_max): local_max = price if(price < local_min): local_min = price if(j == 0): local_open = price if(j == (window_size - 1)): local_close = price windows.append([average_price/float(window_size), local_open, local_max, local_min, local_close]) def copy(src, memory, clevel=5, shuffle=True, cname="blosclz"): """ Parameters ---------- clevel : int (0 <= clevel < 10) The compression level. shuffle : bool Whether the shuffle filter is active or not. cname : string ('blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', others?) Select the compressor to use inside Blosc. 
""" if memory == True: copied = src.copy(bparams=blz.bparams(clevel=clevel, shuffle=shuffle, cname=cname)) else: copied = src.copy(rootdir='blz/temp.blz', bparams=blz.bparams(clevel=clevel, shuffle=shuffle, cname=cname)) copied.flush() return copied def benchmark(data, cmethods, memory): if memory == True: base_name = 'csv/bbtablem_' else: base_name = 'csv/bbtable_' for method in cmethods: myfile = open(base_name + method + '.csv', 'wb') wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) wr.writerow(['Compression level', 'Compressed size', 'Compression ratio', 'Compression time', 'OHLC time', 'Writing speed (MiB/s)', 'OHLC speed (MiB/s)']) for compression_level in xrange(0,10): #I use data so I only keep one copy in memory tc1 = time() data = copy(data, memory, compression_level, True, method) tc2 = time() #Now I get the poins and measure the time t1 = time() get_OHLC_points(data) t2 = time() #Uncompressed size in MiB uncompressed = data.nbytes/(2**20) #I should store stuff row = [compression_level, data.cbytes, round(data.nbytes/float(data.cbytes),3), str(tc2 - tc1), str(t2 - t1), uncompressed/(tc2 - tc1), uncompressed/(t2 - t1)] #Add it to the csv wr.writerow(row) rmtree('blz/temp.blz', ignore_errors=True) myfile.close() cmethods = ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib'] t1 = time() benchmark(btd, cmethods, False) t2 = time() t3 = time() benchmark(btm, cmethods, True) t4 = time() print 'Cached disk: ' + str(t2-t1) print 'Memory only: ' + str(t4-t3) print 'blosclz data' print 'Memory' df = pd.read_csv('csv/bbtablem_blosclz.csv') df print 'blosclz data' print 'Disk' df = pd.read_csv('csv/bbtable_blosclz.csv') df print 'lz4 data' print 'Memory' df = pd.read_csv('csv/bbtablem_lz4.csv') df print 'lz4 data' print 'Disk' df = pd.read_csv('csv/bbtable_lz4.csv') df print 'lz4hc data' print 'Memory' df = pd.read_csv('csv/bbtablem_lz4hc.csv') df print 'lz4hc data' print 'Disk' df = pd.read_csv('csv/bbtable_lz4hc.csv') df print 'snappy data' print 'Memory' df = 
pd.read_csv('csv/bbtablem_snappy.csv') df print 'snappy data' print 'Disk' df = pd.read_csv('csv/bbtable_snappy.csv') df print 'zlib data' print 'Memory' df = pd.read_csv('csv/bbtablem_zlib.csv') df print 'zlib data' print 'Disk' df = pd.read_csv('csv/bbtable_zlib.csv') df def get_dict(filename): columns = defaultdict(list) with open(filename) as f: reader = csv.reader(f) reader.next() for row in reader: for (i,v) in enumerate(row): if i == 1: columns[i].append(float(v)/131072) continue columns[i].append(v) return columns #In memory csv blosclzm = get_dict('csv/bbtablem_blosclz.csv') lz4m = get_dict('csv/bbtablem_lz4.csv') lz4hcm = get_dict('csv/bbtablem_lz4hc.csv') snappym = get_dict('csv/bbtablem_snappy.csv') zlibm = get_dict('csv/bbtablem_zlib.csv') #In disk blosclz = get_dict('csv/bbtable_blosclz.csv') lz4 = get_dict('csv/bbtable_lz4.csv') lz4hc = get_dict('csv/bbtable_lz4hc.csv') snappy = get_dict('csv/bbtable_snappy.csv') zlib = get_dict('csv/bbtable_zlib.csv') %matplotlib inline #Matplotlib magic fig = plt.figure(num=None, figsize=(18, 9), dpi=80, facecolor='w', edgecolor='k') ax = fig.add_subplot(111) ax1 = fig.add_subplot(231) ax2 = fig.add_subplot(232) ax3 = fig.add_subplot(233) ax4 = fig.add_subplot(234) ax5 = fig.add_subplot(235) plt.subplots_adjust(wspace=0.6, hspace=0.6) ax.spines['top'].set_color('none') ax.spines['bottom'].set_color('none') ax.spines['left'].set_color('none') ax.spines['right'].set_color('none') ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') ax.set_xlabel('Compression ratio') ax.set_ylabel('Compression level') #blosclz ax1.set_title('blosclz') ax1.plot(blosclzm[2][1:], blosclzm[0][1:], 'bo-') ax1.plot(blosclz[2][1:], blosclz[0][1:], 'ro-') #lz4 ax2.set_title('lz4') ax2.plot(lz4m[2][1:], blosclzm[0][1:], 'bo-') ax2.plot(lz4[2][1:], blosclz[0][1:], 'ro-') #lz4hc ax3.set_title('lz4hc') ax3.plot(lz4hcm[2][1:], lz4hcm[0][1:], 'bo-') ax3.plot(lz4hc[2][1:], lz4hc[0][1:], 'ro-') #snappy 
# Remaining panels of the "compression level vs. compression ratio" figure.
# Blue = in-memory results, red = on-disk results; [1:] skips the first row.
ax4.set_title('snappy')
ax4.plot(snappym[2][1:], snappym[0][1:], 'bo-')
ax4.plot(snappy[2][1:], snappy[0][1:], 'ro-')
#zlib
ax5.set_title('zlib')
ax5.plot(zlibm[2][1:], zlibm[0][1:], 'bo-', label='Memory')
ax5.plot(zlib[2][1:], zlib[0][1:], 'ro-', label='Disk')
#Legend
ax5.legend(bbox_to_anchor=(2, 1))
plt.show()

# Figure 2: compression time (column 3) against compression ratio (column 2).
# `ax` spans the whole figure and only carries the shared axis labels.
fig = plt.figure(num=None, figsize=(18, 9), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax1 = fig.add_subplot(231)
ax2 = fig.add_subplot(232)
ax3 = fig.add_subplot(233)
ax4 = fig.add_subplot(234)
ax5 = fig.add_subplot(235)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
ax.set_ylabel('Compression time (s)')
ax.set_xlabel('Compression ratio')

# One panel per codec. Every panel plots its OWN codec's time column; the
# lz4 panel no longer borrows blosclz's timings.
time_panels = (
    (ax1, 'blosclz', blosclzm, blosclz),
    (ax2, 'lz4', lz4m, lz4),
    (ax3, 'lz4hc', lz4hcm, lz4hc),
    (ax4, 'snappy', snappym, snappy),
    (ax5, 'zlib', zlibm, zlib),
)
for panel, title, mem, disk in time_panels:
    panel.set_title(title)
    panel.plot(mem[2][1:], mem[3][1:], 'bo-', label='Memory')
    panel.plot(disk[2][1:], disk[3][1:], 'ro-', label='Disk')
#Legend (only drawn once, next to the last panel)
ax5.legend(bbox_to_anchor=(2, 1))
plt.show()

# Figure 3 setup: same layout as above; labels and data are added below.
fig = plt.figure(num=None, figsize=(18, 9), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax1 = fig.add_subplot(231)
ax2 = fig.add_subplot(232)
ax3 = fig.add_subplot(233)
ax4 = fig.add_subplot(234)
ax5 = fig.add_subplot(235)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
# Finish hiding the frame of the figure-wide axes; it only carries the
# shared axis labels for the OHLC-time figure.
for side in ('left', 'right'):
    ax.spines[side].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
ax.set_ylabel('OHLC time (s)')
ax.set_xlabel('Compression level')
plt.suptitle('OHLC/Compression')

# Horizontal reference lines: the mean of the first row's OHLC time
# (column 4, row 0) across all codecs, for disk (Y) and memory (Ym).
Y = sum(float(table[4][0])
        for table in (blosclz, lz4, lz4hc, snappy, zlib)) / len(cmethods)
Ym = sum(float(table[4][0])
         for table in (blosclzm, lz4m, lz4hcm, snappym, zlibm)) / len(cmethods)

# (axes, title, memory table, disk table, memory plot kwargs, disk plot kwargs)
# Only the last panel carries legend labels.
ohlc_panels = (
    (ax1, 'blosclz', blosclzm, blosclz, {}, {}),
    (ax2, 'lz4', lz4m, lz4, {}, {}),
    (ax3, 'lz4hc', lz4hcm, lz4hc, {}, {}),
    (ax4, 'snappy', snappym, snappy, {}, {}),
    (ax5, 'zlib', zlibm, zlib, {'label': 'Memory'}, {'label': 'Disk'}),
)
for panel, title, mem, disk, mem_kw, disk_kw in ohlc_panels:
    panel.set_title(title)
    # Blue = memory, red = disk; [1:] skips the first row of each table.
    panel.plot(mem[0][1:], mem[4][1:], 'bo-', **mem_kw)
    panel.axhline(Ym, color='b')
    panel.plot(disk[0][1:], disk[4][1:], 'ro-', **disk_kw)
    panel.axhline(Y, color='r')
    panel.set_ylim(ymin=2, ymax=4.5)

#Legend
ax5.legend(bbox_to_anchor=(2, 1))
plt.show()

# Set up the next figure: one figure-wide axes for the shared labels plus
# five panels on a 2x3 grid.
fig = plt.figure(num=None, figsize=(18, 9), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax1, ax2, ax3, ax4, ax5 = [fig.add_subplot(code)
                           for code in (231, 232, 233, 234, 235)]
plt.subplots_adjust(wspace=0.6, hspace=0.6)
ax.spines['top'].set_color('none')
# Finish hiding the frame of the figure-wide axes; it only carries the
# shared axis labels for the throughput figure.
for side in ('bottom', 'left', 'right'):
    ax.spines[side].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
ax.set_ylabel('MiB/s')
ax.set_xlabel('Compression level')

# Line styles in drawing order: (use memory table?, column, format, legend label).
# Columns 5 and 6 are plotted against column 0; [1:] skips the first row.
speed_series = (
    (True, 5, 'bo-', 'Compression speed (Memory)'),
    (True, 6, 'go-', 'Computational speed (Memory)'),
    (False, 5, 'ro-', 'Compression speed (Disk)'),
    (False, 6, 'mo-', 'Computational speed (Disk)'),
)
speed_panels = (
    (ax1, 'blosclz', blosclzm, blosclz),
    (ax2, 'lz4', lz4m, lz4),
    (ax3, 'lz4hc', lz4hcm, lz4hc),
    (ax4, 'snappy', snappym, snappy),
    (ax5, 'zlib', zlibm, zlib),
)
for panel, title, mem, disk in speed_panels:
    panel.set_title(title)
    for in_memory, column, fmt, legend_label in speed_series:
        table = mem if in_memory else disk
        # Only the last panel carries legend labels.
        extra = {'label': legend_label} if panel is ax5 else {}
        panel.plot(table[0][1:], table[column][1:], fmt, **extra)

#Legend
ax5.legend(bbox_to_anchor=(2.7, 1))
plt.show()