In this tutorial we are going to show that, in some cases, it is faster to work with compressed data because of disk speed limitations.
To show this, we are going to generate a 30000x20000 Mandelbrot fractal (~12 minutes), store it in a blz container without compression, and then copy that blz into new ones using different compressors and compression levels. We will then downsample each compressed blz and measure the times (~30 minutes).
import numba
import numpy as np
import pylab as plt
from time import time
import blz
from shutil import rmtree
import csv
from math import sqrt
from collections import defaultdict
import pandas
The code used here is discussed in the "Generating huge Mandelbrot's fractals" notebook.
@numba.njit
def mandel(x, y, max_iters):
    """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.
    """
    c = complex(x, y)
    z = 0.0j
    for i in xrange(max_iters):
        z = z*z + c
        if (z.real*z.real + z.imag*z.imag) >= 4:
            return i
    return max_iters
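As a quick sanity check (the two points below are illustrative and not part of the benchmark), a point inside the set exhausts the iteration budget while a point far outside escapes immediately:
print mandel(0.0, 0.0, 20)   #the origin is inside the set, so this returns max_iters (20)
print mandel(2.0, 2.0, 20)   #2+2j escapes on the first iteration, so this returns 0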
def create_fractal(height, width, min_x, max_x, min_y, max_y, image, row, iters):
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    for x in xrange(height):
        imag = min_y + x * pixel_size_y
        for y in xrange(width):
            real = min_x + y * pixel_size_x
            color = mandel(real, imag, iters)
            row[y] = color
        image.append(row)
height = 20000
width = 30000
#If the blz already exists, remove it
rmtree('images/Mandelbrot.blz', ignore_errors=True)
image = blz.zeros((0, width), rootdir='images/Mandelbrot.blz', dtype=np.uint8,
                  expectedlen=height*width,
                  bparams=blz.bparams(clevel=0))
row = np.zeros((width), dtype=np.uint8)
t1 = time()
create_fractal(height, width, -2.0, 1.0, -1.0, 1.0, image, row, 20)
t2 = time()
image.flush()
print t2-t1
697.405315161
Now we have the big fractal in images/Mandelbrot.blz; it takes ~577 MiB of disk space. Let's do some benchmarks.
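If you want to check those numbers yourself, the barray exposes both the uncompressed and the on-disk sizes (a quick, optional check; nbytes and cbytes are the same attributes used later in the benchmark):
img = blz.barray(rootdir='images/Mandelbrot.blz')
print img.nbytes, img.cbytes   #uncompressed size vs. size on disk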
This is the function that creates a new blz from an existing one, using the new compression parameters.
def copy(src, dest, clevel=5, shuffle=True, cname="blosclz"):
    """
    Parameters
    ----------
    clevel : int (0 <= clevel < 10)
        The compression level.
    shuffle : bool
        Whether the shuffle filter is active or not.
    cname : string ('blosclz', 'lz4hc', 'snappy', 'zlib', others?)
        Select the compressor to use inside Blosc.
    """
    src = blz.barray(rootdir=src)
    img_copied = src.copy(rootdir=dest,
                          bparams=blz.bparams(clevel=clevel, shuffle=shuffle, cname=cname),
                          expectedlen=src.size)
    img_copied.flush()
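For example, a level-5 lz4hc copy of the fractal could be made like this (the destination path is just an example):
#Illustrative call: compress the fractal with lz4hc at level 5, shuffle disabled
copy('images/Mandelbrot.blz', 'images/Mandelbrot_lz4hc5.blz', clevel=5, shuffle=False, cname='lz4hc')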
This code will be used to downsample all the images; it has been discussed in the notebook "Numba and blz".
@numba.njit
def mymean(src, p0, p1, y):
    factor = 1/(1. * p0 * p1)
    for i in range(y.shape[0]):
        for j in range(y.shape[1]):
            s = 0.
            for k in range(p0):
                for l in range(p1):
                    s += src[(p0*i)+k, (p1*j)+l] * factor
            y[i, j] = s
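To see what mymean does, here is a tiny worked example (not part of the benchmark) that averages a 4x4 array over 2x2 cells:
a = np.arange(16, dtype=np.float64).reshape(4, 4)   #illustrative input
out = np.empty((2, 2), dtype=np.float64)
mymean(a, 2, 2, out)
print out   #-> [[2.5, 4.5], [10.5, 12.5]]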
def downsample(orig, down_cell, cache_size=2**21):
    c0, c1 = down_cell
    #Let's calculate the matrix dimensions
    pixel_size = orig[0, 0].nbytes
    n = int(round(sqrt(cache_size/pixel_size), 0))
    #How many complete matrices?
    hor = int(orig.shape[1]) / n
    ver = int(orig.shape[0]) / n
    #Complete matrix dimensions
    submatrix_n = int(round(n/float(c0)))
    submatrix_center_shape = (submatrix_n, submatrix_n)
    submatrix_center = np.empty(submatrix_center_shape, dtype=orig.dtype)
    #Bottom border matrix dimensions
    ver_px = int(round((int(orig.shape[0]) % n) / c0, 0))
    submatrix_bottom_shape = (ver_px, submatrix_n)
    submatrix_bottom = np.empty(submatrix_bottom_shape, dtype=orig.dtype)
    #Right border matrix dimensions
    hor_px = int(round((int(orig.shape[1]) % n) / c1, 0))
    submatrix_right_shape = (submatrix_n, hor_px)
    submatrix_right = np.empty(submatrix_right_shape, dtype=orig.dtype)
    #Corner matrix dimensions
    submatrix_corner_shape = (ver_px, hor_px)
    submatrix_corner = np.empty(submatrix_corner_shape, dtype=orig.dtype)
    #We build the final container
    final_shape = (submatrix_n * ver + ver_px,
                   submatrix_n * hor + hor_px)
    final = np.empty(final_shape, orig.dtype)
    #Downsample the middle of the image
    for i in xrange(ver):
        for j in xrange(hor):
            #Get the optimal matrix
            submatrix = orig[i*n:(i+1)*n, j*n:(j+1)*n]
            mymean(submatrix, c0, c1, submatrix_center)
            final[i*submatrix_n:(i+1)*submatrix_n,
                  j*submatrix_n:(j+1)*submatrix_n] = submatrix_center
    #Downsample the right border
    for i in range(ver):
        submatrix = orig[i*n:(i+1)*n, hor*n:]
        mymean(submatrix, c0, c1, submatrix_right)
        final[i*submatrix_n:(i+1)*submatrix_n,
              submatrix_n*hor:] = submatrix_right
    #Downsample the bottom border
    for j in range(hor):
        submatrix = orig[ver*n:, j*n:(j+1)*n]
        mymean(submatrix, c0, c1, submatrix_bottom)
        final[submatrix_n*ver:,
              j*submatrix_n:(j+1)*submatrix_n] = submatrix_bottom
    #Downsample the corner
    submatrix = orig[n*ver:, n*hor:]
    mymean(submatrix, c0, c1, submatrix_corner)
    final[submatrix_n*ver:, submatrix_n*hor:] = submatrix_corner
    return final
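With the default cache_size of 2**21 bytes and uint8 pixels, n works out to roughly 1448, so the image is read and downsampled in blocks of about 1448x1448 pixels. A minimal usage sketch (the same 4x4 factor the benchmark below uses, here against the uncompressed container):
img = blz.barray(rootdir='images/Mandelbrot.blz')
small = downsample(img, (4, 4))   #roughly a 5000x7500 uint8 array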
Now that we have all the ingredients, let's build ourselves some benchmark code. (Note: this code saves the benchmark results to .csv files.)
def benchmark(src, cmethods):
    for method in cmethods:
        myfile = open('csv/' + method + '.csv', 'wb')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(['Compression level', 'Compressed size', 'Compression ratio',
                     'Compression time', 'Downsampling time'])
        for compression_level in xrange(0, 10):
            #Copy the original image with the new compression parameters
            tc1 = time()
            copy(src, 'images/temp.blz', compression_level, False, method)
            tc2 = time()
            #Get the size on disk
            img = blz.barray(rootdir='images/temp.blz')
            disk_size = img.cbytes
            #Now downsample it, measuring the time
            t1 = time()
            downsample(img, (4, 4))
            t2 = time()
            #Store the compression level, size on disk, compression ratio and both times
            row = [compression_level, disk_size, round(img.nbytes/float(disk_size), 3),
                   str(tc2 - tc1), str(t2 - t1)]
            #Add it to the csv
            wr.writerow(row)
            rmtree('images/temp.blz', ignore_errors=True)
        myfile.close()
OK, so let's see how well the downsample function behaves when working with compressed blzs.
src = 'images/Mandelbrot.blz'
cmethods = ['blosclz', 'lz4hc', 'snappy', 'zlib']
t1 = time()
benchmark(src, cmethods)
t2 = time()
print t2-t1
1625.42109704
Now we should have four CSV files with the relevant data. We first convert them to Python dictionaries so we can plot them easily.
def get_dict(filename):
    columns = defaultdict(list)
    with open('csv/' + filename) as f:
        reader = csv.reader(f)
        reader.next()    #skip the header row
        for row in reader:
            for (i, v) in enumerate(row):
                if i == 1:
                    columns[i].append(float(v)/131072)
                    continue
                columns[i].append(v)
    return columns
blosclz = get_dict('blosclz.csv')
lz4hc = get_dict('lz4hc.csv')
snappy = get_dict('snappy.csv')
zlib = get_dict('zlib.csv')
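Each dictionary maps the column index (0: compression level, 1: compressed size, 2: compression ratio, 3: compression time, 4: downsampling time) to a list with one value per compression level; everything is kept as strings except the compressed size, which get_dict converts to a float. For example:
print blosclz[2]   #the compression ratios, as read from the CSV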
Let's see the raw data.
print 'blosclz data'
df = pandas.read_csv('csv/blosclz.csv')
df
blosclz data
| | Compression level | Compressed size | Compression ratio | Compression time | Downsampling time |
|---|---|---|---|---|---|
| 0 | 0 | 600864672 | 0.999 | 2.782878 | 28.031901 |
| 1 | 1 | 12165281 | 49.321 | 1.763561 | 22.019383 |
| 2 | 2 | 11202146 | 53.561 | 1.769053 | 23.470012 |
| 3 | 3 | 11202146 | 53.561 | 1.822704 | 23.755093 |
| 4 | 4 | 10853311 | 55.283 | 1.826486 | 23.889582 |
| 5 | 5 | 10853311 | 55.283 | 1.836225 | 23.851175 |
| 6 | 6 | 10765857 | 55.732 | 1.852662 | 23.982295 |
| 7 | 7 | 10805864 | 55.525 | 1.803914 | 24.268449 |
| 8 | 8 | 10829057 | 55.406 | 1.844621 | 24.067227 |
| 9 | 9 | 10879027 | 55.152 | 1.889094 | 23.935759 |
10 rows × 5 columns
print 'lz4hc data'
df = pandas.read_csv('csv/lz4hc.csv')
df
lz4hc data
| | Compression level | Compressed size | Compression ratio | Compression time | Downsampling time |
|---|---|---|---|---|---|
| 0 | 0 | 600864672 | 0.999 | 2.607177 | 26.384232 |
| 1 | 1 | 7051804 | 85.085 | 22.025753 | 24.304401 |
| 2 | 2 | 7051804 | 85.085 | 21.674532 | 25.063633 |
| 3 | 3 | 7051804 | 85.085 | 21.675338 | 24.217175 |
| 4 | 4 | 6629752 | 90.501 | 25.386786 | 23.898448 |
| 5 | 5 | 6629752 | 90.501 | 25.408783 | 23.106808 |
| 6 | 6 | 6416552 | 93.508 | 37.387300 | 17.693755 |
| 7 | 7 | 6312160 | 95.055 | 38.977684 | 17.679116 |
| 8 | 8 | 6312160 | 95.055 | 38.699354 | 17.719355 |
| 9 | 9 | 6312160 | 95.055 | 38.842180 | 17.736349 |
10 rows × 5 columns
print 'snappy data'
df = pandas.read_csv('csv/snappy.csv')
df
snappy data
| | Compression level | Compressed size | Compression ratio | Compression time | Downsampling time |
|---|---|---|---|---|---|
| 0 | 0 | 600864672 | 0.999 | 2.801109 | 29.384442 |
| 1 | 1 | 598817657 | 1.002 | 2.807479 | 31.367154 |
| 2 | 2 | 598817657 | 1.002 | 4.694462 | 34.749116 |
| 3 | 3 | 598817657 | 1.002 | 2.703463 | 31.206180 |
| 4 | 4 | 598671833 | 1.002 | 2.538312 | 35.297246 |
| 5 | 5 | 598671833 | 1.002 | 2.621801 | 31.044807 |
| 6 | 6 | 580402650 | 1.034 | 2.621331 | 32.259069 |
| 7 | 7 | 543933158 | 1.103 | 2.541712 | 35.558219 |
| 8 | 8 | 543933158 | 1.103 | 2.808962 | 35.409513 |
| 9 | 9 | 600864672 | 0.999 | 3.597196 | 33.235315 |
10 rows × 5 columns
print 'zlib data'
df = pandas.read_csv('csv/zlib.csv')
df
zlib data
| | Compression level | Compressed size | Compression ratio | Compression time | Downsampling time |
|---|---|---|---|---|---|
| 0 | 0 | 600864672 | 0.999 | 2.696286 | 29.421912 |
| 1 | 1 | 9521032 | 63.018 | 3.106901 | 34.339349 |
| 2 | 2 | 9179054 | 65.366 | 3.149718 | 34.597896 |
| 3 | 3 | 8742380 | 68.631 | 3.139577 | 34.406265 |
| 4 | 4 | 6430329 | 93.308 | 6.326290 | 40.333203 |
| 5 | 5 | 6150632 | 97.551 | 6.540398 | 40.457701 |
| 6 | 6 | 5518118 | 108.733 | 12.883182 | 53.341339 |
| 7 | 7 | 5166095 | 116.142 | 12.944861 | 53.405852 |
| 8 | 8 | 4539219 | 132.181 | 17.993135 | 52.008409 |
| 9 | 9 | 4351807 | 137.874 | 16.323670 | 51.841789 |
10 rows × 5 columns
Now that we have taken a look at the raw data, let's do some plotting; we may see interesting things.
%matplotlib inline
#Matplotlib magic
fig = plt.figure()
ax = fig.add_subplot(111)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
plt.rcParams['figure.figsize'] = 10, 10
ax.set_xlabel('Compression ratio')
ax.set_ylabel('Compression level')
#blosclz
ax1.set_title('blosclz')
ax1.plot(blosclz[2][2:], blosclz[0][2:], 'bo-')
#lz4hc
ax2.set_title('lz4hc')
ax2.plot(lz4hc[2][2:], lz4hc[0][2:], 'bo-')
#snappy
ax3.set_title('snappy')
ax3.plot(snappy[2][2:], snappy[0][2:], 'bo-')
#zlib
ax4.set_title('zlib')
ax4.plot(zlib[2][2:], zlib[0][2:], 'bo-')
plt.show()
As we can see above, we don't always get the best compression ratio with a higher compression level. lz4hc and zlib are the only ones that behave as expected. This is because blosclz and snappy are optimized for text, not images.
#Matplotlib magic
fig = plt.figure()
ax = fig.add_subplot(111)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
plt.rcParams['figure.figsize'] = 10, 10
ax.set_ylabel('Compression time (s)')
ax.set_xlabel('Compression ratio')
#blosclz
ax1.set_title('blosclz')
ax1.plot(blosclz[2][2:], blosclz[3][2:], 'bo-')
#lz4hc
ax2.set_title('lz4hc')
ax2.plot(lz4hc[2][2:], lz4hc[3][2:], 'bo-')
#snappy
ax3.set_title('snappy')
ax3.plot(snappy[2][2:], snappy[3][2:], 'bo-')
#zlib
ax4.set_title('zlib')
ax4.plot(zlib[2][2:], zlib[3][2:], 'bo-')
plt.show()
Here we can see that spending more time on compression does not always yield a better compression ratio. Now let's look at the downsampling times:
#Matplotlib magic
fig = plt.figure()
ax = fig.add_subplot(111)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_color('none')
ax.spines['left'].set_color('none')
ax.spines['right'].set_color('none')
ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
plt.rcParams['figure.figsize'] = 10, 10
ax.set_ylabel('Downsampling time (s)')
ax.set_xlabel('Compression level')
#Y value for the red line: mean downsampling time of the four uncompressed (clevel=0) runs
Y = (float(blosclz[4][0]) + float(lz4hc[4][0]) + float(snappy[4][0]) + float(zlib[4][0]))/len(cmethods)
#blosclz
ax1.set_title('blosclz')
ax1.plot(blosclz[0][2:], blosclz[4][2:], 'bo-')
#Uncompressed point
ax1.axhline(Y, color = 'r')
#lz4hc
ax2.set_title('lz4hc')
ax2.plot(lz4hc[0][2:], lz4hc[4][2:], 'bo-')
#Uncompressed point
ax2.axhline(Y, color = 'r')
#snappy
ax3.set_title('snappy')
ax3.plot(snappy[0][2:], snappy[4][2:], 'bo-')
#Uncompressed point
ax3.axhline(Y, color = 'r')
#zlib
ax4.set_title('zlib')
ax4.plot(zlib[0][2:], zlib[4][2:], 'bo-')
#Uncompressed point
ax4.axhline(Y, color = 'r')
plt.show()
The red line represents the downsampling time for the uncompressed blz (mean of the 4 runs), so everything below it is faster than working with the uncompressed data.
Here we can see a few interesting things:
When working with big data (and this was not even that big), RAM is not the only thing that becomes an issue. With a little extra CPU time you can save a lot of disk space (more than 500 MiB for around 4 seconds of overhead in this example), and sometimes compression can even speed things up a little.
Always pick the right compressor for your type of data, and try to run some tests first.