Notebook

Bilateral Filter¶

The purpose of this code is to perform a bilateral filter operation using both the GPU and the CPU. We compare the performance of each method using the system timer.

load image¶

In [1]:

import PIL
import PIL.Image

image                            = PIL.Image.open("CinqueTerre.jpg")
image_array_rgb                  = numpy.array(image)

r_original,g_original,b_original = numpy.split(image_array_rgb, 3, axis=2)
a_original                       = numpy.ones_like(r_original) * 255

rgba_original                    = numpy.concatenate((r_original,g_original,b_original,a_original), axis=2).copy()

figsize(6,4)

matplotlib.pyplot.imshow(rgba_original);
matplotlib.pyplot.title("rgba_original");

isolate small patch from the original image¶

In [2]:

r    = r_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
g    = g_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
b    = b_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8)
a    = numpy.ones_like(r) * 255
rgba = numpy.dstack((r,g,b,a)).copy()



figsize(4,4)

matplotlib.pyplot.imshow(rgba);
matplotlib.pyplot.title("rgba");

common benchmarking code¶

In [3]:

import sys
import time

if sys.platform == "win32":
    print "Using time.clock for benchmarking...\n" 
    system_timer_function = time.clock
else:
    print "Using time.time for benchmarking...\n" 
    system_timer_function = time.time
    
num_timing_iterations = 1
    
    
    
print "num_timing_iterations = %d" % num_timing_iterations

Using time.clock for benchmarking...

num_timing_iterations = 1

common bilateral filter code¶

We use the following expression for the 1D non-normalized Gaussian Function $G(x)$:

$$G(x) = e^{ - \frac{ x^2 }{ 2 { \sigma }^2 } }$$¶

Similarly, we use the following expression for the 2D non-normalized Gaussian Function $G(x,y)$:

$$G(x,y) = e^{ - \frac{ x^2 + y^2 }{ 2 { \sigma }^2 } }$$¶

To perform a bilateral filtering operation, we normalize both of these functions. We use the normalized versions of each function as the range and space kernels in the bilateral filtering algorithm. See [1] for more details.

[1] http://people.csail.mit.edu/sparis/bf_course/course_notes.pdf

In [4]:

import scipy

bilateral_blur_space_kernel_width      = numpy.int32(25)
bilateral_blur_space_kernel_half_width = numpy.int32(12)
bilateral_blur_space_sigma             = numpy.float32(4)

bilateral_blur_range_kernel_width      = numpy.int32(513)
bilateral_blur_range_kernel_half_width = numpy.int32(256)
bilateral_blur_range_sigma             = numpy.float32(30)

#
# set up the space (2D) and range (1D) kernels
#
y_space, x_space = \
    scipy.mgrid[                                                                          \
        -bilateral_blur_space_kernel_half_width:bilateral_blur_space_kernel_half_width+1, \
        -bilateral_blur_space_kernel_half_width:bilateral_blur_space_kernel_half_width+1]

bilateral_blur_space_kernel_not_normalized = numpy.exp( ( - ( x_space**2 + y_space**2 ) ) / ( 2 * bilateral_blur_space_sigma**2 ) )
normalization_constant_space               = numpy.float32(1) / numpy.sum(bilateral_blur_space_kernel_not_normalized)
bilateral_blur_space_kernel                = (normalization_constant_space * bilateral_blur_space_kernel_not_normalized).astype(numpy.float32)

x_range                                    = numpy.array(range(-bilateral_blur_range_kernel_half_width,bilateral_blur_range_kernel_half_width+1))

bilateral_blur_range_kernel_not_normalized = numpy.exp( ( - ( x_range**2 ) ) / ( 2 * bilateral_blur_range_sigma**2 ) )
normalization_constant_range               = numpy.float32(1) / numpy.sum(bilateral_blur_range_kernel_not_normalized)
bilateral_blur_range_kernel                = (normalization_constant_range * bilateral_blur_range_kernel_not_normalized).astype(numpy.float32)



figsize(9,4)

matplotlib.pyplot.subplot(121)
matplotlib.pyplot.imshow(bilateral_blur_space_kernel, cmap="gray", interpolation="nearest");
matplotlib.pyplot.title("bilateral_blur_space_kernel")
matplotlib.pyplot.colorbar();

matplotlib.pyplot.subplot(122)
matplotlib.pyplot.plot(bilateral_blur_range_kernel);
matplotlib.pyplot.title("bilateral_blur_range_kernel")

matplotlib.pyplot.subplots_adjust(wspace=0.8, bottom=0.2, top=0.8)

define CPU bilateral filter¶

In [5]:

import scipy

def bilateral_filter_cpu(
    image,
    bilateral_blur_space_kernel,
    bilateral_blur_range_kernel,
    bilateral_blur_space_kernel_half_width,
    bilateral_blur_range_kernel_half_width ):
    
    #
    # allocate blurred image to return
    #
    blurred_cpu_bilateral = numpy.zeros_like(image)

    #
    # iterate over the input image
    #
    num_pixels_y = image.shape[0]
    num_pixels_x = image.shape[1]
    
    for y in range(num_pixels_y):
        for x in range(num_pixels_x):
        
            #
            # compute non-normalized combined kernel for this pixel
            #
            image_value                                       = image[y,x]
            bilateral_blur_combined_kernel_not_normalized     = numpy.zeros_like(bilateral_blur_space_kernel)
            bilateral_blur_combined_kernel_not_normalized_sum = numpy.float32(0)
    
            for offset_y in range(-bilateral_blur_space_kernel_half_width, bilateral_blur_space_kernel_half_width + 1):
                for offset_x in range(-bilateral_blur_space_kernel_half_width, bilateral_blur_space_kernel_half_width + 1):
                
                    image_offset_index_y                = y + offset_y
                    image_offset_index_x                = x + offset_x
                    image_offset_index_y_clamped        = min(num_pixels_y - 1, max(0, image_offset_index_y))
                    image_offset_index_x_clamped        = min(num_pixels_x - 1, max(0, image_offset_index_x))
    
                    image_offset_value                  = image[image_offset_index_y_clamped,image_offset_index_x_clamped]
        
                    bilateral_blur_space_kernel_index_y = offset_y + bilateral_blur_space_kernel_half_width
                    bilateral_blur_space_kernel_index_x = offset_x + bilateral_blur_space_kernel_half_width
                    bilateral_blur_space_kernel_value   = \
                        bilateral_blur_space_kernel[bilateral_blur_space_kernel_index_y, bilateral_blur_space_kernel_index_x]
    
                    bilateral_blur_range_kernel_index = \
                        abs(numpy.int32(image_value) - numpy.int32(image_offset_value)) + bilateral_blur_range_kernel_half_width
                    bilateral_blur_range_kernel_value = bilateral_blur_range_kernel[bilateral_blur_range_kernel_index]
                    
                    bilateral_blur_combined_kernel_not_normalized_value = \
                        bilateral_blur_space_kernel_value * bilateral_blur_range_kernel_value
                
                    bilateral_blur_combined_kernel_not_normalized[bilateral_blur_space_kernel_index_y,bilateral_blur_space_kernel_index_x] = \
                        bilateral_blur_combined_kernel_not_normalized_value
    
                    bilateral_blur_combined_kernel_not_normalized_sum = \
                        bilateral_blur_combined_kernel_not_normalized_sum + bilateral_blur_combined_kernel_not_normalized_value

            #
            # compute normalization constant for the current combined kernel
            #
            normalization_constant = numpy.float32(1) / bilateral_blur_combined_kernel_not_normalized_sum

            #
            # convolve the current pixel with the current combined kernel and normalize
            #
            result_value = 0

            for offset_y in range(-bilateral_blur_space_kernel_half_width, bilateral_blur_space_kernel_half_width + 1):
                for offset_x in range(-bilateral_blur_space_kernel_half_width, bilateral_blur_space_kernel_half_width + 1):
                    
                    image_offset_index_y         = y + offset_y
                    image_offset_index_x         = x + offset_x
                    image_offset_index_y_clamped = min(num_pixels_y - 1, max(0, image_offset_index_y))
                    image_offset_index_x_clamped = min(num_pixels_x - 1, max(0, image_offset_index_x))
                    
                    image_offset_value           = image[image_offset_index_y_clamped,image_offset_index_x_clamped]

                    bilateral_blur_combined_kernel_index_y = offset_y + bilateral_blur_space_kernel_half_width
                    bilateral_blur_combined_kernel_index_x = offset_x + bilateral_blur_space_kernel_half_width
                    
                    bilateral_blur_combined_kernel_not_normalized_value = \
                        bilateral_blur_combined_kernel_not_normalized[bilateral_blur_combined_kernel_index_y, bilateral_blur_combined_kernel_index_x]
                    
                    result_value = \
                        result_value + (image_offset_value * bilateral_blur_combined_kernel_not_normalized_value * normalization_constant)

            #
            # store the result
            #
            blurred_cpu_bilateral[y,x] = numpy.uint8(result_value)

    return blurred_cpu_bilateral

call CPU bilateral filter¶

In [6]:

total_time_seconds = 0

r_blurred_cpu_bilateral = numpy.zeros_like(r)
g_blurred_cpu_bilateral = numpy.zeros_like(g)
b_blurred_cpu_bilateral = numpy.zeros_like(b)

for i in range(num_timing_iterations):
    
    r_blurred_cpu_gaussian = numpy.zeros_like(r)
    g_blurred_cpu_gaussian = numpy.zeros_like(g)
    b_blurred_cpu_gaussian = numpy.zeros_like(b)

    start_time_seconds     = system_timer_function()

    r_blurred_cpu_bilateral = \
        bilateral_filter_cpu(
            r,
            bilateral_blur_space_kernel,
            bilateral_blur_range_kernel,
            bilateral_blur_space_kernel_half_width,
            bilateral_blur_range_kernel_half_width )
    
    g_blurred_cpu_bilateral = \
        bilateral_filter_cpu(
            g,
            bilateral_blur_space_kernel,
            bilateral_blur_range_kernel,
            bilateral_blur_space_kernel_half_width,
            bilateral_blur_range_kernel_half_width )
    
    b_blurred_cpu_bilateral = \
        bilateral_filter_cpu(
            b,
            bilateral_blur_space_kernel,
            bilateral_blur_range_kernel,
            bilateral_blur_space_kernel_half_width,
            bilateral_blur_range_kernel_half_width )
    
    end_time_seconds       = system_timer_function()
    elapsed_time_seconds   = (end_time_seconds - start_time_seconds)
    total_time_seconds     = total_time_seconds + elapsed_time_seconds

rgba_blurred_cpu_bilateral         = numpy.dstack((r_blurred_cpu_bilateral,g_blurred_cpu_bilateral,b_blurred_cpu_bilateral,a)).copy()
average_time_seconds_cpu_bilateral = total_time_seconds / num_timing_iterations



print "Using system timer for benchmarking (see above)..."
print "Average time elapsed performing a bilateral filter operation on the CPU over %d runs: %f s" % \
    (num_timing_iterations,average_time_seconds_cpu_bilateral)

print

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_cpu_bilateral);
matplotlib.pyplot.title("rgba_blurred_cpu_bilateral");

Using system timer for benchmarking (see above)...
Average time elapsed performing a bilateral filter operation on the CPU over 1 runs: 28113.759890 s

allocate GPU memory¶

In [8]:

import pycuda.autoinit
import pycuda.driver
import pycuda.compiler

num_pixels_y                          = numpy.int32(r.shape[0])
num_pixels_x                          = numpy.int32(r.shape[1])

r_blurred_gpu_naive_bilateral         = numpy.zeros_like(r)
g_blurred_gpu_naive_bilateral         = numpy.zeros_like(g)
b_blurred_gpu_naive_bilateral         = numpy.zeros_like(b)

r_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(r)
g_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(g)
b_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(b)

bilateral_blur_combined_kernel = \
    numpy.zeros((bilateral_blur_space_kernel_width,bilateral_blur_space_kernel_width,num_pixels_y,num_pixels_x), dtype=numpy.float32)

r_device                              = pycuda.driver.mem_alloc(r.nbytes)
g_device                              = pycuda.driver.mem_alloc(g.nbytes)
b_device                              = pycuda.driver.mem_alloc(b.nbytes)

r_blurred_device                      = pycuda.driver.mem_alloc(r_blurred_gpu_naive_bilateral.nbytes)
g_blurred_device                      = pycuda.driver.mem_alloc(g_blurred_gpu_naive_bilateral.nbytes)
b_blurred_device                      = pycuda.driver.mem_alloc(b_blurred_gpu_naive_bilateral.nbytes)

blur_space_kernel_device              = pycuda.driver.mem_alloc(bilateral_blur_space_kernel.nbytes)
blur_range_kernel_device              = pycuda.driver.mem_alloc(bilateral_blur_range_kernel.nbytes)
blur_combined_kernel_device           = pycuda.driver.mem_alloc(bilateral_blur_combined_kernel.nbytes)

define GPU naive bilateral filter¶

In [24]:

import pycuda.autoinit
import pycuda.driver
import pycuda.compiler

source_module = pycuda.compiler.SourceModule \
(
"""
__global__ void naive_bilateral_filter(
    unsigned char* d_blurred,
    unsigned char* d_original,
    float*         d_blur_space_kernel,
    float*         d_blur_range_kernel,
    float*         d_blur_combined_kernel,
    int            num_pixels_y,
    int            num_pixels_x,
    int            blur_space_kernel_half_width,
    int            blur_space_kernel_width,
    int            blur_range_kernel_half_width,
    int            blur_range_kernel_width
)
{
    int  knx            = blur_space_kernel_width;
    int  ny             = num_pixels_y;
    int  nx             = num_pixels_x;
    int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
    int  image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;

    if ( image_index_2d.x < nx && image_index_2d.y < ny )
    {
        //
        // compute non-normalized combined kernel for this pixel
        //
        unsigned char image_value                             = d_original[ image_index_1d ];
        float         blur_combined_kernel_not_normalized_sum = 0;

        for ( int y = -blur_space_kernel_half_width; y <= blur_space_kernel_half_width; y++ )
        {
            for ( int x = -blur_space_kernel_half_width; x <= blur_space_kernel_half_width; x++ )
            {
                int2 image_offset_index_2d         = make_int2( image_index_2d.x + x, image_index_2d.y + y );
                int2 image_offset_index_2d_clamped =
                    make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) );
                int  image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x;
                
                unsigned char image_offset_value = d_original[ image_offset_index_1d_clamped ];

                int2  blur_space_kernel_index_2d = make_int2( x + blur_space_kernel_half_width, y + blur_space_kernel_half_width );
                int   blur_space_kernel_index_1d = ( knx * blur_space_kernel_index_2d.y ) + blur_space_kernel_index_2d.x;
                int   blur_range_kernel_index_1d = abs( (int)image_value - (int)image_offset_value ) + blur_range_kernel_half_width;

                float blur_space_kernel_value    = d_blur_space_kernel[ blur_space_kernel_index_1d ];
                float blur_range_kernel_value    = d_blur_range_kernel[ blur_range_kernel_index_1d ];

                float blur_combined_kernel_not_normalized_value = blur_space_kernel_value * blur_range_kernel_value;

                int blur_combined_kernel_index_1d =
                    ( knx * ny * nx * blur_space_kernel_index_2d.y ) +
                    ( ny  * nx      * blur_space_kernel_index_2d.x ) +
                    ( nx            * image_index_2d.y ) +
                    ( 1             * image_index_2d.x );

                d_blur_combined_kernel[ blur_combined_kernel_index_1d ] = blur_combined_kernel_not_normalized_value;

                blur_combined_kernel_not_normalized_sum += blur_combined_kernel_not_normalized_value;
            }
        }

        //
        // compute normalization constant for the current combined kernel
        //
        float normalization_constant = 1.0f / blur_combined_kernel_not_normalized_sum;
        
        //
        // convolve the current pixel with the current combined kernel and normalize
        //
        float result = 0;
    
        for ( int y = -blur_space_kernel_half_width; y <= blur_space_kernel_half_width; y++ )
        {
            for ( int x = -blur_space_kernel_half_width; x <= blur_space_kernel_half_width; x++ )
            {
                int2 image_offset_index_2d         = make_int2( image_index_2d.x + x, image_index_2d.y + y );
                int2 image_offset_index_2d_clamped =
                    make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) );
                int  image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x;
                
                unsigned char image_offset_value = d_original[ image_offset_index_1d_clamped ];

                int2 blur_space_kernel_index_2d = make_int2( x + blur_space_kernel_half_width, y + blur_space_kernel_half_width );

                int blur_combined_kernel_index_1d =
                    ( knx * ny * nx * blur_space_kernel_index_2d.y ) +
                    ( ny  * nx      * blur_space_kernel_index_2d.x ) +
                    ( nx            * image_index_2d.y ) +
                    ( 1             * image_index_2d.x );

                float blur_combined_kernel_not_normalized_value = d_blur_combined_kernel[ blur_combined_kernel_index_1d ];

                result += normalization_constant * blur_combined_kernel_not_normalized_value * image_offset_value;
            }
        }

        d_blurred[ image_index_1d ] = (unsigned char)result;
    }
}
"""
)

naive_bilateral_filter_function       = source_module.get_function("naive_bilateral_filter")

bilateral_blur_space_kernel_width     = numpy.int32(bilateral_blur_space_kernel.shape[0])
bilateral_blur_range_kernel_width     = numpy.int32(bilateral_blur_range_kernel.shape[0])
                                                    
naive_bilateral_filter_function_block = (16,16,1)

num_blocks_y                          = int(ceil(float(num_pixels_y) / float(naive_bilateral_filter_function_block[1])))
num_blocks_x                          = int(ceil(float(num_pixels_x) / float(naive_bilateral_filter_function_block[0])))
naive_bilateral_filter_function_grid  = (num_blocks_x, num_blocks_y)

call GPU naive bilateral filter¶

In [10]:

total_time_seconds = 0

r_blurred_gpu_naive_bilateral = numpy.zeros_like(r)
g_blurred_gpu_naive_bilateral = numpy.zeros_like(g)
b_blurred_gpu_naive_bilateral = numpy.zeros_like(b)

pycuda.driver.memcpy_htod(r_device,                    r)
pycuda.driver.memcpy_htod(g_device,                    g)
pycuda.driver.memcpy_htod(b_device,                    b)

pycuda.driver.memcpy_htod(r_blurred_device,            r_blurred_gpu_naive_bilateral)
pycuda.driver.memcpy_htod(g_blurred_device,            g_blurred_gpu_naive_bilateral)
pycuda.driver.memcpy_htod(b_blurred_device,            b_blurred_gpu_naive_bilateral)

pycuda.driver.memcpy_htod(blur_space_kernel_device,    bilateral_blur_space_kernel)
pycuda.driver.memcpy_htod(blur_range_kernel_device,    bilateral_blur_range_kernel)
pycuda.driver.memcpy_htod(blur_combined_kernel_device, bilateral_blur_combined_kernel)

for i in range(num_timing_iterations):
    
    pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_naive_bilateral)
    pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_naive_bilateral)
    pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_naive_bilateral)

    pycuda.driver.Context.synchronize()

    start_time_seconds = system_timer_function()

    naive_bilateral_filter_function(
        r_blurred_device,
        r_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=naive_bilateral_filter_function_block,
        grid=naive_bilateral_filter_function_grid)

    naive_bilateral_filter_function(
        g_blurred_device,
        g_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=naive_bilateral_filter_function_block,
        grid=naive_bilateral_filter_function_grid)

    naive_bilateral_filter_function(
        b_blurred_device,
        b_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=naive_bilateral_filter_function_block,
        grid=naive_bilateral_filter_function_grid)
    
    pycuda.driver.Context.synchronize()
    
    end_time_seconds       = system_timer_function()
    elapsed_time_seconds   = (end_time_seconds - start_time_seconds)
    total_time_seconds     = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(r_blurred_gpu_naive_bilateral, r_blurred_device)
pycuda.driver.memcpy_dtoh(g_blurred_gpu_naive_bilateral, g_blurred_device)
pycuda.driver.memcpy_dtoh(b_blurred_gpu_naive_bilateral, b_blurred_device)

rgba_blurred_gpu_naive_bilateral = \
    numpy.dstack((r_blurred_gpu_naive_bilateral,g_blurred_gpu_naive_bilateral,b_blurred_gpu_naive_bilateral,a)).copy()
    
average_time_seconds_gpu_naive_bilateral = total_time_seconds / num_timing_iterations



print "Using system timer for benchmarking (see above)..."
print "Average time elapsed performing a bilateral filter operation on the GPU over %d runs: %f s" % \
    (num_timing_iterations,average_time_seconds_gpu_naive_bilateral)

print

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_bilateral);
matplotlib.pyplot.title("rgba_blurred_gpu_naive_bilateral");

Using system timer for benchmarking (see above)...
Average time elapsed performing a bilateral filter operation on the GPU over 1 runs: 1.760913 s

define GPU shared memory bilateral filter¶

In [11]:

import pycuda.autoinit
import pycuda.driver
import pycuda.compiler

source_module = pycuda.compiler.SourceModule \
(
"""
#define BLOCK_SIZE_Y           16
#define BLOCK_SIZE_X           16
#define BLUR_KERNEL_HALF_WIDTH 12
#define SHARED_MEMORY_SIZE_Y   BLOCK_SIZE_Y + ( 2 * BLUR_KERNEL_HALF_WIDTH )
#define SHARED_MEMORY_SIZE_X   BLOCK_SIZE_X + ( 2 * BLUR_KERNEL_HALF_WIDTH ) + 1
#define SHARED_MEMORY_OFFSET_Y BLUR_KERNEL_HALF_WIDTH
#define SHARED_MEMORY_OFFSET_X BLUR_KERNEL_HALF_WIDTH

__global__ void shared_memory_bilateral_filter(
    unsigned char* d_blurred,
    unsigned char* d_original,
    float*         d_blur_space_kernel,
    float*         d_blur_range_kernel,
    float*         d_blur_combined_kernel,
    int            num_pixels_y,
    int            num_pixels_x,
    int            blur_space_kernel_half_width,
    int            blur_space_kernel_width,
    int            blur_range_kernel_half_width,
    int            blur_range_kernel_width
)
{
    __shared__ unsigned char s_original[ SHARED_MEMORY_SIZE_Y ][ SHARED_MEMORY_SIZE_X ];

    int  ny                            = num_pixels_y;
    int  nx                            = num_pixels_x;
    int  knx                           = blur_space_kernel_width;
    int2 image_index_2d_global         = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
    int2 image_index_2d_global_clamped =
        make_int2( min( nx - 1, max( 0, image_index_2d_global.x ) ), min( ny - 1, max( 0, image_index_2d_global.y ) ) );
    int  image_index_1d_global_clamped = ( nx * image_index_2d_global_clamped.y ) + image_index_2d_global_clamped.x;
    int2 image_index_2d_shared_memory  = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y );

    //
    // load center of shared memory
    //
    s_original[ image_index_2d_shared_memory.y ][ image_index_2d_shared_memory.x ] =
        d_original[ image_index_1d_global_clamped ];
    
    //
    // load y+1 halo into shared memory
    //
    if ( threadIdx.y < BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load y-1 halo into shared memory
    //
    if ( threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x+1 halo into shared memory
    //
    if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x-1 halo into shared memory
    //
    if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x+1,y+1 halo into shared memory
    //
    if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH && threadIdx.y < BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x+1,y-1 halo into shared memory
    //
    if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x-1,y+1 halo into shared memory
    //
    if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y < BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // load x-1,y-1 halo into shared memory
    //
    if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH )
    {
        int2 image_halo_index_2d_global         =
            make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y );
        int2 image_halo_index_2d_global_clamped =
            make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) );
        int  image_halo_index_1d_global_clamped =
            ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x;
        int2 image_halo_index_2d_shared_memory  =
            make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y );

        s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] =
            d_original[ image_halo_index_1d_global_clamped ];
    }

    //
    // wait until all threads in the thread block are finished loading the image chunk into shared memory
    //
    __syncthreads();

    //
    // perform bilateral filter by reading image from shared memory
    //
    if ( image_index_2d_global.x < nx && image_index_2d_global.y < ny )
    {
        //
        // compute non-normalized combined kernel for this pixel
        //
        unsigned char image_value                             =
            s_original[ image_index_2d_shared_memory.y ][ image_index_2d_shared_memory.x ];
        float         blur_combined_kernel_not_normalized_sum = 0;

        for ( int y = -blur_space_kernel_half_width; y <= blur_space_kernel_half_width; y++ )
        {
            for ( int x = -blur_space_kernel_half_width; x <= blur_space_kernel_half_width; x++ )
            {
                int2          image_offset_index_2d      =
                    make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y );

                unsigned char image_offset_value         = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ];

                int2          blur_space_kernel_index_2d =
                    make_int2( x + blur_space_kernel_half_width, y + blur_space_kernel_half_width );
                int           blur_space_kernel_index_1d =
                    ( knx * blur_space_kernel_index_2d.y ) + blur_space_kernel_index_2d.x;
                int           blur_range_kernel_index_1d =
                    abs( (int)image_value - (int)image_offset_value ) + blur_range_kernel_half_width;

                float         blur_space_kernel_value    = d_blur_space_kernel[ blur_space_kernel_index_1d ];
                float         blur_range_kernel_value    = d_blur_range_kernel[ blur_range_kernel_index_1d ];

                float         blur_combined_kernel_not_normalized_value = blur_space_kernel_value * blur_range_kernel_value;

                int blur_combined_kernel_index_1d =
                    ( knx * ny * nx * blur_space_kernel_index_2d.y ) +
                    ( ny  * nx      * blur_space_kernel_index_2d.x ) +
                    ( nx            * image_index_2d_global.y ) +
                    ( 1             * image_index_2d_global.x );

                d_blur_combined_kernel[ blur_combined_kernel_index_1d ] = blur_combined_kernel_not_normalized_value;

                blur_combined_kernel_not_normalized_sum += blur_combined_kernel_not_normalized_value;
            }
        }

        //
        // compute normalization constant for the current combined kernel
        //
        float normalization_constant = 1.0f / blur_combined_kernel_not_normalized_sum;
        
        //
        // convolve the current pixel with the current combined kernel and normalize
        //
        float result = 0;

        for ( int y = -blur_space_kernel_half_width; y <= blur_space_kernel_half_width; y++ )
        {
            for ( int x = -blur_space_kernel_half_width; x <= blur_space_kernel_half_width; x++ )
            {
                int2          image_offset_index_2d      =
                    make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y );

                unsigned char image_offset_value         = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ];

                int2          blur_space_kernel_index_2d = make_int2( x + blur_space_kernel_half_width, y + blur_space_kernel_half_width );

                int blur_combined_kernel_index_1d =
                    ( knx * ny * nx * blur_space_kernel_index_2d.y ) +
                    ( ny  * nx      * blur_space_kernel_index_2d.x ) +
                    ( nx            * image_index_2d_global.y ) +
                    ( 1             * image_index_2d_global.x );

                float blur_combined_kernel_not_normalized_value = d_blur_combined_kernel[ blur_combined_kernel_index_1d ];

                result += normalization_constant * blur_combined_kernel_not_normalized_value * image_offset_value;
            }
        }

        d_blurred[ image_index_1d_global_clamped ] = (unsigned char)result;
    }
}
"""
)

shared_memory_bilateral_filter_function       = source_module.get_function("shared_memory_bilateral_filter")

num_pixels_y                                  = numpy.int32(r.shape[0])
num_pixels_x                                  = numpy.int32(r.shape[1])
bilateral_blur_space_kernel_width             = numpy.int32(bilateral_blur_space_kernel.shape[0])
bilateral_blur_range_kernel_width             = numpy.int32(bilateral_blur_range_kernel.shape[0])
                                                    
shared_memory_bilateral_filter_function_block = (16,16,1)

num_blocks_y                                  = int(ceil(float(num_pixels_y) / float(shared_memory_bilateral_filter_function_block[1])))
num_blocks_x                                  = int(ceil(float(num_pixels_x) / float(shared_memory_bilateral_filter_function_block[0])))
shared_memory_bilateral_filter_function_grid  = (num_blocks_x, num_blocks_y)

bilateral_blur_space_kernel_width             = numpy.int32(bilateral_blur_space_kernel.shape[0])
bilateral_blur_range_kernel_width             = numpy.int32(bilateral_blur_range_kernel.shape[0])

call GPU shared memory bilateral filter¶

In [12]:

total_time_seconds = 0

r_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(r)
g_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(g)
b_blurred_gpu_shared_memory_bilateral = numpy.zeros_like(b)

pycuda.driver.memcpy_htod(r_device,                    r)
pycuda.driver.memcpy_htod(g_device,                    g)
pycuda.driver.memcpy_htod(b_device,                    b)

pycuda.driver.memcpy_htod(r_blurred_device,            r_blurred_gpu_shared_memory_bilateral)
pycuda.driver.memcpy_htod(g_blurred_device,            g_blurred_gpu_shared_memory_bilateral)
pycuda.driver.memcpy_htod(b_blurred_device,            b_blurred_gpu_shared_memory_bilateral)

pycuda.driver.memcpy_htod(blur_space_kernel_device,    bilateral_blur_space_kernel)
pycuda.driver.memcpy_htod(blur_range_kernel_device,    bilateral_blur_range_kernel)
pycuda.driver.memcpy_htod(blur_combined_kernel_device, bilateral_blur_combined_kernel)

for i in range(num_timing_iterations):
    
    pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_shared_memory_bilateral)
    pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_shared_memory_bilateral)
    pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_shared_memory_bilateral)

    pycuda.driver.Context.synchronize()
    
    start_time_seconds = system_timer_function()

    shared_memory_bilateral_filter_function(
        r_blurred_device,
        r_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=shared_memory_bilateral_filter_function_block,
        grid=shared_memory_bilateral_filter_function_grid)

    shared_memory_bilateral_filter_function(
        g_blurred_device,
        g_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=shared_memory_bilateral_filter_function_block,
        grid=shared_memory_bilateral_filter_function_grid)

    shared_memory_bilateral_filter_function(
        b_blurred_device,
        b_device,
        blur_space_kernel_device,
        blur_range_kernel_device,
        blur_combined_kernel_device,
        num_pixels_y,
        num_pixels_x,
        bilateral_blur_space_kernel_half_width,
        bilateral_blur_space_kernel_width,
        bilateral_blur_range_kernel_half_width,
        bilateral_blur_range_kernel_width,
        block=shared_memory_bilateral_filter_function_block,
        grid=shared_memory_bilateral_filter_function_grid)
    
    pycuda.driver.Context.synchronize()
    
    end_time_seconds       = system_timer_function()
    elapsed_time_seconds   = (end_time_seconds - start_time_seconds)
    total_time_seconds     = total_time_seconds + elapsed_time_seconds

pycuda.driver.memcpy_dtoh(r_blurred_gpu_shared_memory_bilateral, r_blurred_device)
pycuda.driver.memcpy_dtoh(g_blurred_gpu_shared_memory_bilateral, g_blurred_device)
pycuda.driver.memcpy_dtoh(b_blurred_gpu_shared_memory_bilateral, b_blurred_device)

rgba_blurred_gpu_shared_memory_bilateral = \
    numpy.dstack((r_blurred_gpu_shared_memory_bilateral,g_blurred_gpu_shared_memory_bilateral,b_blurred_gpu_shared_memory_bilateral,a)).copy()
    
average_time_seconds_gpu_shared_memory_bilateral = total_time_seconds / num_timing_iterations



print "Using system timer for benchmarking (see above)..."
print \
    "Average time elapsed performing a bilateral filter operation on the GPU over %d runs: %f s" % \
    (num_timing_iterations,average_time_seconds_gpu_shared_memory_bilateral)

print

figsize(4,4)

matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_bilateral);
matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_bilateral");

Using system timer for benchmarking (see above)...
Average time elapsed performing a bilateral filter operation on the GPU over 1 runs: 1.173320 s

verify correctness of GPU naive¶

In [19]:

diff = \
    numpy.abs(r_blurred_cpu_bilateral.astype(numpy.float32) - r_blurred_gpu_naive_bilateral.astype(numpy.float32)) + \
    numpy.abs(g_blurred_cpu_bilateral.astype(numpy.float32) - g_blurred_gpu_naive_bilateral.astype(numpy.float32)) + \
    numpy.abs(b_blurred_cpu_bilateral.astype(numpy.float32) - b_blurred_gpu_naive_bilateral.astype(numpy.float32))

max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3



print \
    "Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" % \
    (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff)))

print

figsize(14,4)

matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(rgba_blurred_cpu_bilateral);
matplotlib.pyplot.title("rgba_blurred_cpu_bilateral")

matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_bilateral);
matplotlib.pyplot.title("rgba_blurred_gpu_bilateral")

matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray");
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();

Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%): 0.00%

verify correctness of GPU shared memory¶

In [18]:

diff = \
    numpy.abs(r_blurred_gpu_naive_bilateral.astype(numpy.float32) - r_blurred_gpu_shared_memory_bilateral.astype(numpy.float32)) + \
    numpy.abs(g_blurred_gpu_naive_bilateral.astype(numpy.float32) - g_blurred_gpu_shared_memory_bilateral.astype(numpy.float32)) + \
    numpy.abs(b_blurred_gpu_naive_bilateral.astype(numpy.float32) - b_blurred_gpu_shared_memory_bilateral.astype(numpy.float32))

max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3



print \
    "Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" % \
    (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff)))

print

figsize(14,4)

matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_bilateral);
matplotlib.pyplot.title("rgba_blurred_gpu_naive_bilateral")

matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_bilateral);
matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_bilateral")

matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray");
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();

Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%): 0.00%

measure GPU speedup¶

In [15]:

speedup_gpu_naive         = average_time_seconds_cpu_bilateral / average_time_seconds_gpu_naive_bilateral
speedup_gpu_shared_memory = average_time_seconds_cpu_bilateral / average_time_seconds_gpu_shared_memory_bilateral


print "Average CPU time:                 %f s" % average_time_seconds_cpu_bilateral
print "Average GPU (naive) time:         %f s" % average_time_seconds_gpu_naive_bilateral
print "Average GPU (shared memory) time: %f s" % average_time_seconds_gpu_shared_memory_bilateral

print

print "GPU (naive) speedup:              %f x" % speedup_gpu_naive
print "GPU (shared memory) speedup:      %f x" % speedup_gpu_shared_memory

Average CPU time:                 28113.759890 s
Average GPU (naive) time:         1.760913 s
Average GPU (shared memory) time: 1.173320 s

GPU (naive) speedup:              15965.447508 x
GPU (shared memory) speedup:      23960.858629 x