import PIL import PIL.Image image = PIL.Image.open("CinqueTerre.jpg") image_array_rgb = numpy.array(image) r_original,g_original,b_original = numpy.split(image_array_rgb, 3, axis=2) a_original = numpy.ones_like(r_original) * 255 rgba_original = numpy.concatenate((r_original,g_original,b_original,a_original), axis=2).copy() figsize(6,4) matplotlib.pyplot.imshow(rgba_original); matplotlib.pyplot.title("rgba_original"); r = r_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8) g = g_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8) b = b_original[1150:1450,2300:2600,0].copy().astype(numpy.uint8) a = numpy.ones_like(r) * 255 rgba = numpy.dstack((r,g,b,a)).copy() figsize(4,4) matplotlib.pyplot.imshow(rgba); matplotlib.pyplot.title("rgba"); import scipy gaussian_blur_kernel_width = numpy.int32(9) gaussian_blur_kernel_half_width = numpy.int32(4) gaussian_blur_sigma = numpy.float32(2) y, x = \ scipy.mgrid[-gaussian_blur_kernel_half_width:gaussian_blur_kernel_half_width+1, -gaussian_blur_kernel_half_width:gaussian_blur_kernel_half_width+1] gaussian_blur_kernel_not_normalized = numpy.exp( ( - ( x**2 + y**2 ) ) / ( 2 * gaussian_blur_sigma**2 ) ) normalization_constant = numpy.float32(1) / numpy.sum(gaussian_blur_kernel_not_normalized) gaussian_blur_kernel = (normalization_constant * gaussian_blur_kernel_not_normalized).astype(numpy.float32) matplotlib.pyplot.imshow(gaussian_blur_kernel, cmap="gray", interpolation="nearest"); matplotlib.pyplot.title("gaussian_blur_kernel"); matplotlib.pyplot.colorbar(); import scipy.ndimage import scipy.ndimage.filters convolution_filter = gaussian_blur_kernel r_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(r, convolution_filter, mode="nearest") g_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(g, convolution_filter, mode="nearest") b_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(b, convolution_filter, mode="nearest") rgba_blurred_cpu_gaussian = numpy.dstack((r_blurred_cpu_gaussian,g_blurred_cpu_gaussian,b_blurred_cpu_gaussian,a)).copy() figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian); matplotlib.pyplot.title("rgba_blurred_cpu_gaussian"); import pycuda.autoinit import pycuda.driver import pycuda.compiler r_blurred_gpu_naive_gaussian = numpy.zeros_like(r) g_blurred_gpu_naive_gaussian = numpy.zeros_like(g) b_blurred_gpu_naive_gaussian = numpy.zeros_like(b) source_module = pycuda.compiler.SourceModule \ ( """ __global__ void naive_blur( unsigned char* d_blurred, unsigned char* d_original, float* d_blur_kernel, int num_pixels_y, int num_pixels_x, int blur_kernel_half_width, int blur_kernel_width ) { int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x; if ( image_index_2d.x < nx && image_index_2d.y < ny ) { float result = 0; for ( int y = -blur_kernel_half_width; y <= blur_kernel_half_width; y++ ) { for ( int x = -blur_kernel_half_width; x <= blur_kernel_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d.x + x, image_index_2d.y + y ); int2 image_offset_index_2d_clamped = make_int2( min( nx - 1, max( 0, image_offset_index_2d.x ) ), min( ny - 1, max( 0, image_offset_index_2d.y ) ) ); int image_offset_index_1d_clamped = ( nx * image_offset_index_2d_clamped.y ) + image_offset_index_2d_clamped.x; unsigned char image_offset_value = d_original[ image_offset_index_1d_clamped ]; int2 blur_kernel_index_2d = make_int2( x + blur_kernel_half_width, y + blur_kernel_half_width ); int blur_kernel_index_1d = ( blur_kernel_width * blur_kernel_index_2d.y ) + blur_kernel_index_2d.x; float blur_kernel_value = d_blur_kernel[ blur_kernel_index_1d ]; result += blur_kernel_value * image_offset_value; } } d_blurred[ image_index_1d ] = (unsigned char)result; } } """ ) naive_blur_function = source_module.get_function("naive_blur") num_pixels_y = numpy.int32(r.shape[0]) num_pixels_x = numpy.int32(r.shape[1]) naive_blur_function_block = (32,8,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(naive_blur_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(naive_blur_function_block[0]))) naive_blur_function_grid = (num_blocks_x, num_blocks_y) r_device = pycuda.driver.mem_alloc(r.nbytes) g_device = pycuda.driver.mem_alloc(g.nbytes) b_device = pycuda.driver.mem_alloc(b.nbytes) r_blurred_device = pycuda.driver.mem_alloc(r_blurred_gpu_naive_gaussian.nbytes) g_blurred_device = pycuda.driver.mem_alloc(g_blurred_gpu_naive_gaussian.nbytes) b_blurred_device = pycuda.driver.mem_alloc(b_blurred_gpu_naive_gaussian.nbytes) blur_kernel_device = pycuda.driver.mem_alloc(gaussian_blur_kernel.nbytes) pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel) naive_blur_function( r_blurred_device, r_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) naive_blur_function( g_blurred_device, g_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) naive_blur_function( b_blurred_device, b_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) pycuda.driver.memcpy_dtoh(r_blurred_gpu_naive_gaussian, r_blurred_device) pycuda.driver.memcpy_dtoh(g_blurred_gpu_naive_gaussian, g_blurred_device) pycuda.driver.memcpy_dtoh(b_blurred_gpu_naive_gaussian, b_blurred_device) rgba_blurred_gpu_naive_gaussian = \ numpy.dstack((r_blurred_gpu_naive_gaussian,g_blurred_gpu_naive_gaussian,b_blurred_gpu_naive_gaussian,a)).copy() figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian"); import pycuda.autoinit import pycuda.driver import pycuda.compiler r_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(r) g_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(g) b_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(b) source_module = pycuda.compiler.SourceModule \ ( """ #define BLOCK_SIZE_Y 8 #define BLOCK_SIZE_X 32 #define BLUR_KERNEL_HALF_WIDTH 4 #define SHARED_MEMORY_SIZE_Y BLOCK_SIZE_Y + ( 2 * BLUR_KERNEL_HALF_WIDTH ) #define SHARED_MEMORY_SIZE_X BLOCK_SIZE_X + ( 2 * BLUR_KERNEL_HALF_WIDTH ) + 1 #define SHARED_MEMORY_OFFSET_Y BLUR_KERNEL_HALF_WIDTH #define SHARED_MEMORY_OFFSET_X BLUR_KERNEL_HALF_WIDTH __global__ void shared_memory_blur( unsigned char* d_blurred, unsigned char* d_original, float* d_blur_kernel, int num_pixels_y, int num_pixels_x, int blur_kernel_half_width, int blur_kernel_width ) { __shared__ unsigned char s_original[ SHARED_MEMORY_SIZE_Y ][ SHARED_MEMORY_SIZE_X ]; int ny = num_pixels_y; int nx = num_pixels_x; int2 image_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_index_2d_global.x ) ), min( ny - 1, max( 0, image_index_2d_global.y ) ) ); int image_index_1d_global_clamped = ( nx * image_index_2d_global_clamped.y ) + image_index_2d_global_clamped.x; int2 image_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); // // load center of shared memory // s_original[ image_index_2d_shared_memory.y ][ image_index_2d_shared_memory.x ] = d_original[ image_index_1d_global_clamped ]; // // load y+1 halo into shared memory // if ( threadIdx.y < BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load y-1 halo into shared memory // if ( threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1 halo into shared memory // if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1,y+1 halo into shared memory // if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH && threadIdx.y < BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x+1,y-1 halo into shared memory // if ( threadIdx.x < BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x + 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X + BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1,y+1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y < BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y + 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y + BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // load x-1,y-1 halo into shared memory // if ( threadIdx.x >= BLOCK_SIZE_X - BLUR_KERNEL_HALF_WIDTH && threadIdx.y >= BLOCK_SIZE_Y - BLUR_KERNEL_HALF_WIDTH ) { int2 image_halo_index_2d_global = make_int2( ( ( blockIdx.x - 1 ) * blockDim.x ) + threadIdx.x, ( ( blockIdx.y - 1 ) * blockDim.y ) + threadIdx.y ); int2 image_halo_index_2d_global_clamped = make_int2( min( nx - 1, max( 0, image_halo_index_2d_global.x ) ), min( ny - 1, max( 0, image_halo_index_2d_global.y ) ) ); int image_halo_index_1d_global_clamped = ( nx * image_halo_index_2d_global_clamped.y ) + image_halo_index_2d_global_clamped.x; int2 image_halo_index_2d_shared_memory = make_int2( threadIdx.x + SHARED_MEMORY_OFFSET_X - BLOCK_SIZE_X, threadIdx.y + SHARED_MEMORY_OFFSET_Y - BLOCK_SIZE_Y ); s_original[ image_halo_index_2d_shared_memory.y ][ image_halo_index_2d_shared_memory.x ] = d_original[ image_halo_index_1d_global_clamped ]; } // // wait until all threads in the thread block are finished loading the image chunk into shared memory // __syncthreads(); // // perform blur operation by reading image from shared memory // if ( image_index_2d_global.x < nx && image_index_2d_global.y < ny ) { float result = 0; for ( int y = -blur_kernel_half_width; y <= blur_kernel_half_width; y++ ) { for ( int x = -blur_kernel_half_width; x <= blur_kernel_half_width; x++ ) { int2 image_offset_index_2d = make_int2( image_index_2d_shared_memory.x + x, image_index_2d_shared_memory.y + y ); unsigned char image_offset_value = s_original[ image_offset_index_2d.y ][ image_offset_index_2d.x ]; int2 blur_kernel_index_2d = make_int2( x + blur_kernel_half_width, y + blur_kernel_half_width ); int blur_kernel_index_1d = ( blur_kernel_width * blur_kernel_index_2d.y ) + blur_kernel_index_2d.x; float blur_kernel_value = d_blur_kernel[ blur_kernel_index_1d ]; result += blur_kernel_value * image_offset_value; } } d_blurred[ image_index_1d_global_clamped ] = (unsigned char)result; } } """ ) shared_memory_blur_function = source_module.get_function("shared_memory_blur") num_pixels_y = numpy.int32(r.shape[0]) num_pixels_x = numpy.int32(r.shape[1]) shared_memory_blur_function_block = (32,8,1) num_blocks_y = int(ceil(float(num_pixels_y) / float(shared_memory_blur_function_block[1]))) num_blocks_x = int(ceil(float(num_pixels_x) / float(shared_memory_blur_function_block[0]))) shared_memory_blur_function_grid = (num_blocks_x, num_blocks_y) r_device = pycuda.driver.mem_alloc(r.nbytes) g_device = pycuda.driver.mem_alloc(g.nbytes) b_device = pycuda.driver.mem_alloc(b.nbytes) r_blurred_device = pycuda.driver.mem_alloc(r_blurred_gpu_shared_memory_gaussian.nbytes) g_blurred_device = pycuda.driver.mem_alloc(g_blurred_gpu_shared_memory_gaussian.nbytes) b_blurred_device = pycuda.driver.mem_alloc(b_blurred_gpu_shared_memory_gaussian.nbytes) blur_kernel_device = pycuda.driver.mem_alloc(gaussian_blur_kernel.nbytes) pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel) shared_memory_blur_function( r_blurred_device, r_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) shared_memory_blur_function( g_blurred_device, g_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) shared_memory_blur_function( b_blurred_device, b_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) pycuda.driver.memcpy_dtoh(r_blurred_gpu_shared_memory_gaussian, r_blurred_device) pycuda.driver.memcpy_dtoh(g_blurred_gpu_shared_memory_gaussian, g_blurred_device) pycuda.driver.memcpy_dtoh(b_blurred_gpu_shared_memory_gaussian, b_blurred_device) rgba_blurred_gpu_shared_memory_gaussian = \ numpy.dstack((r_blurred_gpu_shared_memory_gaussian,g_blurred_gpu_shared_memory_gaussian,b_blurred_gpu_shared_memory_gaussian,a)).copy() figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_gaussian"); diff = \ numpy.abs(r_blurred_cpu_gaussian.astype(numpy.float32) - r_blurred_gpu_naive_gaussian.astype(numpy.float32)) + \ numpy.abs(g_blurred_cpu_gaussian.astype(numpy.float32) - g_blurred_gpu_naive_gaussian.astype(numpy.float32)) + \ numpy.abs(b_blurred_cpu_gaussian.astype(numpy.float32) - b_blurred_gpu_naive_gaussian.astype(numpy.float32)) max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3 print \ "Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" % \ (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff))) print figsize(14,4) matplotlib.pyplot.subplot(131) matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian); matplotlib.pyplot.title("rgba_blurred_cpu_gaussian") matplotlib.pyplot.subplot(132) matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian") matplotlib.pyplot.subplot(133) matplotlib.pyplot.imshow(diff, cmap="gray"); matplotlib.pyplot.title("diff") matplotlib.pyplot.colorbar(); diff = \ numpy.abs(r_blurred_gpu_naive_gaussian.astype(numpy.float32) - r_blurred_gpu_shared_memory_gaussian.astype(numpy.float32)) + \ numpy.abs(g_blurred_gpu_naive_gaussian.astype(numpy.float32) - g_blurred_gpu_shared_memory_gaussian.astype(numpy.float32)) + \ numpy.abs(b_blurred_gpu_naive_gaussian.astype(numpy.float32) - b_blurred_gpu_shared_memory_gaussian.astype(numpy.float32)) max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255 * 3 print \ "Difference between GPU naive and GPU shared memory results as a percentage of the maximum possible difference (should be 0%%): %0.2f%%" % \ (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff))) print figsize(14,4) matplotlib.pyplot.subplot(131) matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian") matplotlib.pyplot.subplot(132) matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_gaussian") matplotlib.pyplot.subplot(133) matplotlib.pyplot.imshow(diff, cmap="gray"); matplotlib.pyplot.title("diff") matplotlib.pyplot.colorbar(); import sys import time if sys.platform == "win32": print "Using time.clock for benchmarking...\n" system_timer_function = time.clock else: print "Using time.time for benchmarking...\n" system_timer_function = time.time num_timing_iterations = 100 print "num_timing_iterations = %d" % num_timing_iterations total_time_seconds = 0 for i in range(num_timing_iterations): r_blurred_cpu_gaussian = numpy.zeros_like(r) g_blurred_cpu_gaussian = numpy.zeros_like(g) b_blurred_cpu_gaussian = numpy.zeros_like(b) convolution_filter = gaussian_blur_kernel start_time_seconds = system_timer_function() r_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(r, convolution_filter, mode="nearest") g_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(g, convolution_filter, mode="nearest") b_blurred_cpu_gaussian = scipy.ndimage.filters.convolve(b, convolution_filter, mode="nearest") end_time_seconds = system_timer_function() elapsed_time_seconds = (end_time_seconds - start_time_seconds) total_time_seconds = total_time_seconds + elapsed_time_seconds rgba_blurred_cpu_gaussian = numpy.dstack((r_blurred_cpu_gaussian,g_blurred_cpu_gaussian,b_blurred_cpu_gaussian,a)).copy() average_time_seconds_cpu_gaussian = total_time_seconds / num_timing_iterations print "Using system timer for benchmarking (see above)..." print "Average time elapsed executing a uniform blur on the CPU over %d runs: %f s" % (num_timing_iterations,average_time_seconds_cpu_gaussian) print figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_cpu_gaussian); matplotlib.pyplot.title("rgba_blurred_cpu_gaussian"); total_time_seconds = 0 r_blurred_gpu_naive_gaussian = numpy.zeros_like(r) g_blurred_gpu_naive_gaussian = numpy.zeros_like(g) b_blurred_gpu_naive_gaussian = numpy.zeros_like(b) for i in range(num_timing_iterations): pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_naive_gaussian) pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel) pycuda.driver.Context.synchronize() start_time_seconds = system_timer_function() naive_blur_function( r_blurred_device, r_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) naive_blur_function( g_blurred_device, g_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) naive_blur_function( b_blurred_device, b_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=naive_blur_function_block, grid=naive_blur_function_grid) pycuda.driver.Context.synchronize() end_time_seconds = system_timer_function() elapsed_time_seconds = end_time_seconds - start_time_seconds total_time_seconds = total_time_seconds + elapsed_time_seconds pycuda.driver.memcpy_dtoh(r_blurred_gpu_naive_gaussian, r_blurred_device) pycuda.driver.memcpy_dtoh(g_blurred_gpu_naive_gaussian, g_blurred_device) pycuda.driver.memcpy_dtoh(b_blurred_gpu_naive_gaussian, b_blurred_device) rgba_blurred_gpu_naive_gaussian = numpy.dstack((r_blurred_gpu_naive_gaussian,g_blurred_gpu_naive_gaussian,b_blurred_gpu_naive_gaussian,a)).copy() average_time_seconds_gpu_naive_gaussian = total_time_seconds / num_timing_iterations print "Using system timer for benchmarking (see above)..." print "Average time elapsed executing naive_uniform_blur GPU kernel over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_naive_gaussian) print figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_gpu_naive_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_naive_gaussian"); total_time_seconds = 0 for i in range(num_timing_iterations): r_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(r) g_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(g) b_blurred_gpu_shared_memory_gaussian = numpy.zeros_like(b) pycuda.driver.memcpy_htod(r_device, r) pycuda.driver.memcpy_htod(g_device, g) pycuda.driver.memcpy_htod(b_device, b) pycuda.driver.memcpy_htod(r_blurred_device, r_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(g_blurred_device, g_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(b_blurred_device, b_blurred_gpu_shared_memory_gaussian) pycuda.driver.memcpy_htod(blur_kernel_device, gaussian_blur_kernel) pycuda.driver.Context.synchronize() start_time_seconds = system_timer_function() shared_memory_blur_function( r_blurred_device, r_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) shared_memory_blur_function( g_blurred_device, g_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) shared_memory_blur_function( b_blurred_device, b_device, blur_kernel_device, num_pixels_y, num_pixels_x, gaussian_blur_kernel_half_width, gaussian_blur_kernel_width, block=shared_memory_blur_function_block, grid=shared_memory_blur_function_grid) pycuda.driver.Context.synchronize() end_time_seconds = system_timer_function() elapsed_time_seconds = end_time_seconds - start_time_seconds total_time_seconds = total_time_seconds + elapsed_time_seconds pycuda.driver.memcpy_dtoh(r_blurred_gpu_shared_memory_gaussian, r_blurred_device) pycuda.driver.memcpy_dtoh(g_blurred_gpu_shared_memory_gaussian, g_blurred_device) pycuda.driver.memcpy_dtoh(b_blurred_gpu_shared_memory_gaussian, b_blurred_device) rgba_blurred_gpu_shared_memory_gaussian = \ numpy.dstack((r_blurred_gpu_shared_memory_gaussian,g_blurred_gpu_shared_memory_gaussian,b_blurred_gpu_shared_memory_gaussian,a)).copy() average_time_seconds_gpu_shared_memory_gaussian = total_time_seconds / num_timing_iterations print "Using system timer for benchmarking (see above)..." print "Average time elapsed executing color_to_greyscale GPU kernel over %d runs: %f s" % (num_timing_iterations,average_time_seconds_gpu_shared_memory_gaussian) print figsize(4,4) matplotlib.pyplot.imshow(rgba_blurred_gpu_shared_memory_gaussian); matplotlib.pyplot.title("rgba_blurred_gpu_shared_memory_gaussian"); speedup_gpu_naive = average_time_seconds_cpu_gaussian / average_time_seconds_gpu_naive_gaussian speedup_gpu_shared_memory = average_time_seconds_cpu_gaussian / average_time_seconds_gpu_shared_memory_gaussian print "Average CPU time: %f s" % average_time_seconds_cpu_gaussian print "Average GPU (naive) time: %f s" % average_time_seconds_gpu_naive_gaussian print "Average GPU (shared memory) time: %f s" % average_time_seconds_gpu_shared_memory_gaussian print print "GPU (naive) speedup: %f x" % speedup_gpu_naive print "GPU (shared memory) speedup: %f x" % speedup_gpu_shared_memory