# Gaussian Blur¶

The purpose of this code is to perform a Gaussian blur operation using both the GPU and the CPU. We compare the performance of each method using the system timer.

In [1]:
import PIL
import PIL.Image

image                            = PIL.Image.open("CinqueTerre.jpg")
image_array_rgb                  = numpy.array(image)

r_original,g_original,b_original = numpy.split(image_array_rgb, 3, axis=2)
a_original                       = numpy.ones_like(r_original) * 255

rgba_original                    = numpy.concatenate((r_original,g_original,b_original,a_original), axis=2).copy()

figsize(6,4)

matplotlib.pyplot.imshow(rgba_original);
matplotlib.pyplot.title("rgba_original");


## isolate small patch from the original image¶

In [2]:
r    = r_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
g    = g_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
b    = b_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
a    = numpy.ones_like(r) * 255
rgba = numpy.dstack((r,g,b,a)).copy()

figsize(4,4)

matplotlib.pyplot.imshow(rgba);
matplotlib.pyplot.title("rgba");


## common gaussian blur code¶

We begin by defining the non-normalized Gaussian Function $G(x,y)$ as follows:

### $$G(x,y) = e^{ - \frac{ x^2 + y^2 }{ 2 { \sigma }^2 } }$$¶

We then define the normalized Gaussian Function as $N(x,y)=cG(x,y)$, where $c$ is a normalization constant:

### $$c = \frac{ 1 }{ \int{ \int{ G(x,y) } } \mathrm{d}x \mathrm{d}y }$$¶

To perform a Gaussian Blur, we use $N(x,y)$ as a convolution kernel.

In [4]:
import scipy

gaussian_blur_kernel_width      = numpy.int32(9)
gaussian_blur_kernel_half_width = numpy.int32(4)
gaussian_blur_sigma             = numpy.float32(2)

y, x = \
scipy.mgrid[-gaussian_blur_kernel_half_width:gaussian_blur_kernel_half_width+1,
-gaussian_blur_kernel_half_width:gaussian_blur_kernel_half_width+1]

gaussian_blur_kernel_not_normalized = numpy.exp( ( - ( x**2 + y**2 ) ) / ( 2 * gaussian_blur_sigma**2 ) )
normalization_constant              = numpy.float32(1) / numpy.sum(gaussian_blur_kernel_not_normalized)
gaussian_blur_kernel                = (normalization_constant * gaussian_blur_kernel_not_normalized).astype(numpy.float32)

matplotlib.pyplot.imshow(gaussian_blur_kernel, cmap="gray", interpolation="nearest");
matplotlib.pyplot.title("gaussian_blur_kernel");
matplotlib.pyplot.colorbar();


## CPU gaussian blur¶

In [5]:
import scipy.ndimage
import scipy.ndimage.filters

convolution_filter        = gaussian_blur_kernel
r_blurred_cpu_gaussian    = scipy.ndimage.filters.convolve(r, convolution_filter, mode="nearest")
g_blurred_cpu_gaussian    = scipy.ndimage.filters.convolve(g, convolution_filter, mode="nearest")
b_blurred_cpu_gaussian    = scipy.ndimage.filters.convolve(b, convolution_filter, mode="nearest")
rgba_blurred_cpu_gaussian = numpy.dstack((r_blurred_cpu_gaussian,g_blurred_cpu_gaussian,b_blurred_cpu_gaussian,a)).copy()

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian);
matplotlib.pyplot.title("rgba_blurred_cpu_gaussian");


## naive gaussian blur on the GPU¶

In [6]:
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler

r_blurred_gpu_naive_gaussian = numpy.zeros_like(r)
g_blurred_gpu_naive_gaussian = numpy.zeros_like(g)
b_blurred_gpu_naive_gaussian = numpy.zeros_like(b)

source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void naive_blur(
unsigned char* d_blurred,
unsigned char* d_original,
float*         d_blur_kernel,
int            num_pixels_y,
int            num_pixels_x,
int            blur_kernel_half_width,
int            blur_kernel_width )
{
int  ny             = num_pixels_y;
int  nx             = num_pixels_x;
int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int  image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;

if ( image_index_2d.x < nx && image_index_2d.y < ny )
{
float result = 0;

for ( int y = -blur_kernel_half_width; y <= blur_kernel_half_width; y++ )
{
for ( int x = -blur_kernel_half_width; x <= blur_kernel_half_width; x++ )
{
int2 image_offset_index_2d         = make_int2( image_index_2d.x + x, image_index_2d.y + y );
int2 image_offset_index_2d_clamped =
make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) );
int  image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x;

unsigned char image_offset_value   = d_original[ image_offset_index_1d_clamped ];

int2 blur_kernel_index_2d          = make_int2( x + blur_kernel_half_width, y + blur_kernel_half_width );
int  blur_kernel_index_1d          = ( blur_kernel_width * blur_kernel_index_2d.y ) + blur_kernel_index_2d.x;

float blur_kernel_value            = d_blur_kernel[ blur_kernel_index_1d ];

result += blur_kernel_value * image_offset_value;
}
}

d_blurred[ image_index_1d ] = (unsigned char)result;
}
}
"""
)

naive_blur_function       = source_module.get_function("naive_blur")

num_pixels_y              = numpy.int32(r.shape[0])
num_pixels_x              = numpy.int32(r.shape[1])
naive_blur_function_block = (32,8,1)

num_blocks_y              = int(ceil(float(num_pixels_y) / float(naive_blur_function_block[1])))
num_blocks_x              = int(ceil(float(num_pixels_x) / float(naive_blur_function_block[0])))
naive_blur_function_grid  = (num_blocks_x, num_blocks_y)

r_device                  = pycuda.driver.mem_alloc(r.nbytes)
g_device                  = pycuda.driver.mem_alloc(g.nbytes)
b_device                  = pycuda.driver.mem_alloc(b.nbytes)

r_blurred_device          = pycuda.driver.mem_alloc(r_blurred_gpu_naive_gaussian.nbytes)
g_blurred_device          = pycuda.driver.mem_alloc(g_blurred_gpu_naive_gaussian.nbytes)
b_blurred_device          = pycuda.driver.mem_alloc(b_blurred_gpu_naive_gaussian.nbytes)

blur_kernel_device        = pycuda.driver.mem_alloc(gaussian_blur_kernel.nbytes)

pycuda.driver.memcpy_htod(r_device,           r)
pycuda.driver.memcpy_htod(g_device,           g)
pycuda.driver.memcpy_htod(b_device,           b)

pycuda.driver.memcpy_htod(r_blurred_device,   r_blurred_gpu_naive_gaussian)
pycuda.driver.memcpy_htod(g_blurred_device,   g_blurred_gpu_naive_gaussian)
pycuda.driver.memcpy_htod(b_blurred_device,   b_blurred_gpu_naive_gaussian)

pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel)

naive_blur_function(
r_blurred_device,
r_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=naive_blur_function_block,
grid=naive_blur_function_grid)

naive_blur_function(
g_blurred_device,
g_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=naive_blur_function_block,
grid=naive_blur_function_grid)

naive_blur_function(
b_blurred_device,
b_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=naive_blur_function_block,
grid=naive_blur_function_grid)

pycuda.driver.memcpy_dtoh(r_blurred_gpu_naive_gaussian, r_blurred_device)
pycuda.driver.memcpy_dtoh(g_blurred_gpu_naive_gaussian, g_blurred_device)
pycuda.driver.memcpy_dtoh(b_blurred_gpu_naive_gaussian, b_blurred_device)

rgba_blurred_gpu_naive_gaussian = \
numpy.dstack((r_blurred_gpu_naive_gaussian,g_blurred_gpu_naive_gaussian,b_blurred_gpu_naive_gaussian,a)).copy()

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian);
matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian");


## shared memory gaussian blur on the GPU¶

In [11]:
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler

r_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(r)
g_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(g)
b_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(b)

source_module = pycuda.compiler.SourceModule \
(
"""
#define BLOCK_SIZE_Y           8
#define BLOCK_SIZE_X           32
#define BLUR_KERNEL_HALF_WIDTH 4
#define SHARED_MEMORY_SIZE_Y   BLOCK_SIZE_Y + ( 2 * BLUR_KERNEL_HALF_WIDTH )
#define SHARED_MEMORY_SIZE_X   BLOCK_SIZE_X + ( 2 * BLUR_KERNEL_HALF_WIDTH ) + 1
#define SHARED_MEMORY_OFFSET_Y BLUR_KERNEL_HALF_WIDTH
#define SHARED_MEMORY_OFFSET_X BLUR_KERNEL_HALF_WIDTH

__global__ void shared_memory_blur(
unsigned char* d_blurred,
unsigned char* d_original,
float*         d_blur_kernel,
int            num_pixels_y,
int            num_pixels_x,
int            blur_kernel_half_width,
int            blur_kernel_width )
{
__shared__ unsigned char s_original[ SHARED_MEMORY_SIZE_Y ][ SHARED_MEMORY_SIZE_X ];

int  ny                            = num_pixels_y;
int  nx                            = num_pixels_x;
int2 image_index_2d_global         =
make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int2 image_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_index_2d_global.x ) ), min( ny - 1, max( 0, image_index_2d_global.y ) ) );
int  image_index_1d_global_clamped = ( nx * image_index_2d_global_clamped.y ) + image_index_2d_global_clamped.x;

//
// load center of shared memory
//
s_original[ image_index_2d_shared_memory.y ][ image_index_2d_shared_memory.x ] = d_original[ image_index_1d_global_clamped ];

//
// load y+1 halo into shared memory
//
if ( threadIdx.y < BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load y-1 halo into shared memory
//
if ( threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x+1 halo into shared memory
//
if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x-1 halo into shared memory
//
if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory =

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x+1,y+1 halo into shared memory
//
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =
make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y );

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x+1,y-1 halo into shared memory
//
if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =
make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y );

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x-1,y+1 halo into shared memory
//
if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y < BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =
make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y );

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
// load x-1,y-1 halo into shared memory
//
if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
{
int2 image_halo_index_2d_global         =
make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
int2 image_halo_index_2d_global_clamped =
make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
int  image_halo_index_1d_global_clamped =
( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
int2 image_halo_index_2d_shared_memory  =
make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y );

s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
d_original[ image_halo_index_1d_global_clamped ];
}

//
//

//
// perform blur operation by reading image from shared memory
//
if ( image_index_2d_global.x < nx && image_index_2d_global.y < ny )
{
float result = 0;

for ( int y = -blur_kernel_half_width; y <= blur_kernel_half_width; y++ )
{
for ( int x = -blur_kernel_half_width; x <= blur_kernel_half_width; x++ )
{
int2          image_offset_index_2d =
make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y );

unsigned char image_offset_value    = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ];

int2          blur_kernel_index_2d  = make_int2( x + blur_kernel_half_width, y + blur_kernel_half_width );
int           blur_kernel_index_1d  = ( blur_kernel_width * blur_kernel_index_2d.y ) + blur_kernel_index_2d.x;

float         blur_kernel_value     = d_blur_kernel[ blur_kernel_index_1d ];

result += blur_kernel_value * image_offset_value;
}
}

d_blurred[ image_index_1d_global_clamped ] = (unsigned char)result;
}
}
"""
)

shared_memory_blur_function = source_module.get_function("shared_memory_blur")

num_pixels_y                        = numpy.int32(r.shape[0])
num_pixels_x                        = numpy.int32(r.shape[1])
shared_memory_blur_function_block   = (32,8,1)

num_blocks_y                        = int(ceil(float(num_pixels_y) / float(shared_memory_blur_function_block[1])))
num_blocks_x                        = int(ceil(float(num_pixels_x) / float(shared_memory_blur_function_block[0])))
shared_memory_blur_function_grid    = (num_blocks_x, num_blocks_y)

r_device                  = pycuda.driver.mem_alloc(r.nbytes)
g_device                  = pycuda.driver.mem_alloc(g.nbytes)
b_device                  = pycuda.driver.mem_alloc(b.nbytes)

r_blurred_device          = pycuda.driver.mem_alloc(r_blurred_gpu_shared_memory_gaussian.nbytes)
g_blurred_device          = pycuda.driver.mem_alloc(g_blurred_gpu_shared_memory_gaussian.nbytes)
b_blurred_device          = pycuda.driver.mem_alloc(b_blurred_gpu_shared_memory_gaussian.nbytes)

blur_kernel_device        = pycuda.driver.mem_alloc(gaussian_blur_kernel.nbytes)

pycuda.driver.memcpy_htod(r_device,           r)
pycuda.driver.memcpy_htod(g_device,           g)
pycuda.driver.memcpy_htod(b_device,           b)

pycuda.driver.memcpy_htod(r_blurred_device,   r_blurred_gpu_shared_memory_gaussian)
pycuda.driver.memcpy_htod(g_blurred_device,   g_blurred_gpu_shared_memory_gaussian)
pycuda.driver.memcpy_htod(b_blurred_device,   b_blurred_gpu_shared_memory_gaussian)

pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel)

shared_memory_blur_function(
r_blurred_device,
r_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=shared_memory_blur_function_block,
grid=shared_memory_blur_function_grid)

shared_memory_blur_function(
g_blurred_device,
g_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=shared_memory_blur_function_block,
grid=shared_memory_blur_function_grid)

shared_memory_blur_function(
b_blurred_device,
b_device,
blur_kernel_device,
num_pixels_y,
num_pixels_x,
gaussian_blur_kernel_half_width,
gaussian_blur_kernel_width,
block=shared_memory_blur_function_block,
grid=shared_memory_blur_function_grid)

pycuda.driver.memcpy_dtoh(r_blurred_gpu_shared_memory_gaussian, r_blurred_device)
pycuda.driver.memcpy_dtoh(g_blurred_gpu_shared_memory_gaussian, g_blurred_device)
pycuda.driver.memcpy_dtoh(b_blurred_gpu_shared_memory_gaussian, b_blurred_device)

rgba_blurred_gpu_shared_memory_gaussian = \
numpy.dstack((r_blurred_gpu_shared_memory_gaussian,g_blurred_gpu_shared_memory_gaussian,b_blurred_gpu_shared_memory_gaussian,a)).copy()

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_gaussian);
matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_gaussian");


## verify correctness of GPU naive¶

In [12]:
diff = \
numpy.abs(r_blurred_cpu_gaussian.astype(numpy.float32) - r_blurred_gpu_naive_gaussian.astype(numpy.float32)) + \
numpy.abs(g_blurred_cpu_gaussian.astype(numpy.float32) - g_blurred_gpu_naive_gaussian.astype(numpy.float32)) + \
numpy.abs(b_blurred_cpu_gaussian.astype(numpy.float32) - b_blurred_gpu_naive_gaussian.astype(numpy.float32))

max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3

print \
"Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff)))
print

figsize(14,4)

matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian);
matplotlib.pyplot.title("rgba_blurred_cpu_gaussian")

matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian);
matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian")

matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray");
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();

Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%): 0.01%



## verify correctness of GPU shared memory¶

In [19]:
diff = \
numpy.abs(r_blurred_gpu_naive_gaussian.astype(numpy.float32) - r_blurred_gpu_shared_memory_gaussian.astype(numpy.float32)) + \
numpy.abs(g_blurred_gpu_naive_gaussian.astype(numpy.float32) - g_blurred_gpu_shared_memory_gaussian.astype(numpy.float32)) + \
numpy.abs(b_blurred_gpu_naive_gaussian.astype(numpy.float32) - b_blurred_gpu_shared_memory_gaussian.astype(numpy.float32))

max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3

print \
"Difference between GPU naive and GPU shared memory results as a percentage of the maximum possible difference (should be 0%%): %0.2f%%" % \
(100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff)))
print

figsize(14,4)

matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian);
matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian")

matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_gaussian);
matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_gaussian")

matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray");
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();

Difference between GPU naive and GPU shared memory results as a percentage of the maximum possible difference (should be 0%): 0.00%



## common benchmarking code¶

In [14]:
import sys
import time

if sys.platform == "win32":
print "Using time.clock for benchmarking...\n"
system_timer_function = time.clock
else:
print "Using time.time for benchmarking...\n"
system_timer_function = time.time

num_timing_iterations = 100

print "num_timing_iterations = %d" % num_timing_iterations

Using time.clock for benchmarking...

num_timing_iterations = 100


## benchmark CPU code using the system timer¶

In [15]:
total_time_seconds = 0

for i in range(num_timing_iterations):

r_blurred_cpu_gaussian = numpy.zeros_like(r)
g_blurred_cpu_gaussian = numpy.zeros_like(g)
b_blurred_cpu_gaussian = numpy.zeros_like(b)

convolution_filter     = gaussian_blur_kernel

start_time_seconds     = system_timer_function()

r_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(r, convolution_filter, mode="nearest")
g_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(g, convolution_filter, mode="nearest")
b_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(b, convolution_filter, mode="nearest")

end_time_seconds       = system_timer_function()
elapsed_time_seconds   = (end_time_seconds - start_time_seconds)
total_time_seconds     = total_time_seconds + elapsed_time_seconds

rgba_blurred_cpu_gaussian         = numpy.dstack((r_blurred_cpu_gaussian,g_blurred_cpu_gaussian,b_blurred_cpu_gaussian,a)).copy()
average_time_seconds_cpu_gaussian = total_time_seconds / num_timing_iterations

print "Using system timer for benchmarking (see above)..."
print "Average time elapsed executing a uniform blur on the CPU over %d runs: %f s" % (num_timing_iterations,average_time_seconds_cpu_gaussian)

print

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian);
matplotlib.pyplot.title("rgba_blurred_cpu_gaussian");

Using system timer for benchmarking (see above)...
Average time elapsed executing a uniform blur on the CPU over 100 runs: 0.163597 s