Motivation and explanation for the code is available at How To Quickly Compute The Mandelbrot Set In Python
Timings depend heavily on the machine and Python version used. The timings below are for a Windows laptop (Lenovo Thinkpad W520) with Anaconda 64 bits and Python 3.5. A more recent machine could be way faster for gpu computing for instance.
Let's import some useful packages first.
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline
In order to check code correctness we will display the images.
The display code is taken from How To Make Python Run As Fast As Julia.
We use small image sizes to enable quick check on all code, including the slowest ones.
def mandelbrot_image(xmin,xmax,ymin,ymax,width=3,height=3,maxiter=80,cmap='hot'):
dpi = 72
img_width = dpi * width
img_height = dpi * height
x,y,z = mandelbrot_set(xmin,xmax,ymin,ymax,img_width,img_height,maxiter)
fig, ax = plt.subplots(figsize=(width, height),dpi=72)
ticks = np.arange(0,img_width,3*dpi)
x_ticks = xmin + (xmax-xmin)*ticks/img_width
plt.xticks(ticks, x_ticks)
y_ticks = ymin + (ymax-ymin)*ticks/img_width
plt.yticks(ticks, y_ticks)
norm = colors.PowerNorm(0.3)
ax.imshow(z.T,cmap=cmap,origin='lower',norm=norm)
Let's set a base line, using code from Julia GitHhub
def mandelbrot(z,maxiter):
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
return (r1,r2,[mandelbrot(complex(r, i),maxiter) for r in r1 for i in r2])
We use two images for the benchmark
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 11 s per loop
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 3min 56s per loop
Let's replace lists with Numpy arrays.
def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return 0
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,10)
1 loops, best of 3: 8.25 s per loop
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 6min 10s per loop
Let's check these images are correct. We check with a small size image given the code is really slow.
mandelbrot_image(-2.0,0.5,-1.25,1.25,maxiter=80,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
Compiling code with Numba is easy, we simply need to annotate the functions with @jit
.
from numba import jit
@jit
def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return 0
@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)
Timing it shows it is pretty fast
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 167 ms per loop 1 loops, best of 3: 3.84 s per loop
Checking if it is correct
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
Removing square root computation within abs
@jit
def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if z.real * z.real + z.imag * z.imag > 4.0:
return n
z = z*z + c
return 0
@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)
It is faster.
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 115 ms per loop 1 loops, best of 3: 2.74 s per loop
Checking if it is correct
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
Decomposing complex into two floats.
@jit
def mandelbrot(creal,cimag,maxiter):
real = creal
imag = cimag
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal
return 0
@jit
def mandelbrot_set4(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i],r2[j],maxiter)
return (r1,r2,n3)
Slightly faster.
%timeit mandelbrot_set4(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set4(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 112 ms per loop 1 loops, best of 3: 2.44 s per loop
mandelbrot_set = mandelbrot_set4
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
@jit
def mandel(x, y, max_iters):
"""
Given the real and imaginary parts of a complex number,
determine if it is a candidate for membership in the Mandelbrot
set given a fixed number of iterations.
"""
c = complex(x,y)
z = 0j
for i in range(max_iters):
z = z*z + c
if z.real * z.real + z.imag * z.imag > 4:
return i
return 0
@jit(nopython=True)
def create_fractal(min_x, max_x, min_y, max_y, width, height, iters):
image = np.empty((width,height))
pixel_size_x = (max_x - min_x) / width
pixel_size_y = (max_y - min_y) / height
for x in range(width):
real = min_x + x * pixel_size_x
for y in range(height):
imag = min_y + y * pixel_size_y
color = mandel(real, imag, iters)
image[y, x] = color
return image
A tad slower than mine.
%timeit create_fractal(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit create_fractal(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 120 ms per loop 1 loops, best of 3: 2.88 s per loop
%load_ext cython
%%cython
import cython
import numpy as np
cdef int mandelbrot(double creal, double cimag, int maxiter):
cdef:
double real2, imag2
double real = creal, imag = cimag
int n
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal;
return 0
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mandelbrot_set(double xmin, double xmax, double ymin, double ymax, int width, int height, int maxiter):
cdef:
double[:] r1 = np.linspace(xmin, xmax, width)
double[:] r2 = np.linspace(ymin, ymax, height)
int[:,:] n3 = np.empty((width,height), np.int)
int i,j
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i], r2[j], maxiter)
return (r1,r2,n3)
Almost as fast as Numba
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
10 loops, best of 3: 115 ms per loop
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 2.8 s per loop
Let's check it
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
def mandelbrot_numpy(q, maxiter):
# calculate z using numpy, this is the original
# routine from vegaseat's URL
output = np.resize(np.array(0,), q.shape)
z = np.zeros(q.shape, np.complex64)
for it in range(maxiter):
z = z*z + q
done = np.greater(abs(z), 2.0)
q = np.where(done, 0+0j, q)
z = np.where(done, 0+0j, z)
output = np.where(done, it, output)
return output
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
q = np.ravel(r1 + r2[:,None]*1j)
n3 = mandelbrot_numpy(q,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 3.28 s per loop
We can check it is correct with the following.
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
There is a better code from http://www.vallis.org/salon/summary-10.html
def mandelbrot_numpy(c, maxiter):
output = np.zeros(c.shape)
z = np.empty(c.shape, np.complex64)
output[:] = maxiter
for it in range(maxiter):
notdone = (output == maxiter)
z[notdone] = z[notdone]**2 + c[notdone]
output[notdone & (z.real*z.real + z.imag*z.imag > 4.0)] = it
return output
It is indeed way faster.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 1.31 s per loop
The code can be made faster by only performing computation where needed. There is no need to ravel the input array either.
def mandelbrot_numpy(c, maxiter):
output = np.zeros(c.shape)
z = np.zeros(c.shape, np.complex64)
for it in range(maxiter):
notdone = np.less(z.real*z.real + z.imag*z.imag, 4.0)
output[notdone] = it
z[notdone] = z[notdone]**2 + c[notdone]
output[output == maxiter-1] = 0
return output
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
n3 = mandelbrot_numpy(c,maxiter)
return (r1,r2,n3.T)
It is about 3 times faster than the first Numpy array code.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 1.03 s per loop
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 29.4 s per loop
We can check it is correct with the following.
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
We can avoid temporary array creation with NumExpr.
import numexpr as ne
def mandelbrot_numpy(c, maxiter):
output = np.zeros(c.shape)
z = np.zeros(c.shape, np.complex64)
for it in range(maxiter):
notdone = ne.evaluate('z.real*z.real + z.imag*z.imag < 4.0')
output[notdone] = it
z = ne.evaluate('where(notdone,z**2+c,z)')
output[output == maxiter-1] = 0
return output
This is faster.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 749 ms per loop
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 20.7 s per loop
Check.
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
An alternative to NumExpr.
from numba import vectorize, complex64, boolean, jit
@vectorize([boolean(complex64)])
def f(z):
return (z.real*z.real + z.imag*z.imag) < 4.0
@vectorize([complex64(complex64, complex64)])
def g(z,c):
return z*z + c
@jit
def mandelbrot_numpy(c, maxiter):
output = np.zeros(c.shape, np.int)
z = np.empty(c.shape, np.complex64)
for it in range(maxiter):
notdone = f(z)
output[notdone] = it
z[notdone] = g(z[notdone],c[notdone])
output[output == maxiter-1] = 0
return output
def mandelbrot_set5(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = np.empty((r1.shape[0], r2.shape[0]), dtype=np.complex64)
c[:,:] = r1 + r2[:,None]*1j
n3 = mandelbrot_numpy(c,maxiter)
return (r1,r2,n3.T)
Even faster!
%timeit mandelbrot_set5(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 629 ms per loop
%timeit mandelbrot_set5(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 17.8 s per loop
Check.
mandelbrot_set = mandelbrot_set5
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
Try parallelism
from numba import vectorize, complex64, boolean
@vectorize([boolean(complex64)], target='parallel')
def f(z):
return (z.real*z.real + z.imag*z.imag) < 4.0
@vectorize([complex64(complex64, complex64)], target='parallel')
def g(z,c):
return z*z + c
def mandelbrot_numpy(c, maxiter):
output = np.empty(c.shape, np.int)
z = np.zeros(c.shape, np.complex64)
for it in range(maxiter):
notdone = f(z)
output[notdone] = it
z[notdone] = g(z[notdone],c[notdone])
return output
Not paying off.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
1 loops, best of 3: 793 ms per loop
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 27.3 s per loop
We should parallelize the top loop. We can do it via guvectorize.
Let's try guvectorize too. It amounts to reusing the sequential code.
import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32
@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
nreal = 0
real = 0
imag = 0
for n in range(maxiter):
nreal = real*real - imag*imag + c.real
imag = 2* real*imag + c.imag
real = nreal;
if real * real + imag * imag > 4.0:
return n
return 0
@guvectorize([(complex64[:], int32[:], int32[:])], '(n),()->(n)',target='parallel')
def mandelbrot_numpy(c, maxit, output):
maxiter = maxit[0]
for i in range(c.shape[0]):
output[i] = mandelbrot(c[i],maxiter)
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
n3 = mandelbrot_numpy(c,maxiter)
return (r1,r2,n3.T)
Way faster than the sequential code compiled with Numba.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
10 loops, best of 3: 43 ms per loop
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 1.02 s per loop
Check
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
Let's try Cuda target. Since there is a bug, we must pass the iteration max as an array too.
import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32
@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
creal = c.real
cimag = c.imag
real = creal
imag = cimag
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal
return 0
@guvectorize([(complex64[:], int32[:], int32[:])], '(n),(n)->(n)', target='cuda')
def mandelbrot_numpy(c, maxit, output):
maxiter = maxit[0]
for i in range(c.shape[0]):
creal = c[i].real
cimag = c[i].imag
real = creal
imag = cimag
output[i] = 0
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
output[i] = n
break
imag = 2* real*imag + cimag
real = real2 - imag2 + creal
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
n3 = np.empty(c.shape, int)
maxit = np.ones(c.shape, int) * maxiter
n3 = mandelbrot_numpy(c,maxit)
return (r1,r2,n3.T)
Slightly slower.
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
10 loops, best of 3: 53.7 ms per loop
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 906 ms per loop
Check
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
Code from https://www.tensorflow.org/versions/master/tutorials/mandelbrot/index.html
Timings on my machine:
5.29394221306
34.6283330917
We reuse the boilerplate code from PyOpenCl documentation as much as possible.
from __future__ import absolute_import
from __future__ import print_function
import pyopencl as cl
%load_ext pyopencl.ipython_ext
We create a context interactively in order to select which device to use.
ctx = cl.create_some_context(interactive=True)
devices = ctx.get_info(cl.context_info.DEVICES)
print(devices)
[<pyopencl.Device 'Intel(R) Core(TM) i7-2760QM CPU @ 2.40GHz' on 'Intel(R) OpenCL' at 0x407ca20>]
The code is moving the heavy duty piece to a piece of C code thta gets compiled and run on the selected device.
Input and output for that code is handled in Python.
def mandelbrot_gpu(q, maxiter):
global ctx
queue = cl.CommandQueue(ctx)
output = np.empty(q.shape, dtype=np.uint16)
prg = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void mandelbrot(__global float2 *q,
__global ushort *output, ushort const maxiter)
{
int gid = get_global_id(0);
float nreal, real = 0;
float imag = 0;
output[gid] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[gid].x;
imag = 2* real*imag + q[gid].y;
real = nreal;
if (real*real + imag*imag > 4.0f){
output[gid] = curiter;
break;
}
}
}
""").build()
mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg.mandelbrot(queue, output.shape, None, q_opencl,
output_opencl, np.uint16(maxiter))
cl.enqueue_copy(queue, output, output_opencl).wait()
return output
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
Following code is faster on gpu, above code is faster on cpu.
def mandelbrot_gpu(q, maxiter):
global ctx
queue = cl.CommandQueue(ctx)
output = np.empty(q.shape, dtype=np.uint16)
prg = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void mandelbrot(__global float2 *q,
__global ushort *output, ushort const maxiter)
{
int gid = get_global_id(0);
float real = q[gid].x;
float imag = q[gid].y;
output[gid] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real, imag2 = imag*imag;
if (real*real + imag*imag > 4.0f){
output[gid] = curiter;
return;
}
imag = 2* real*imag + q[gid].y;
real = real2 - imag2 + q[gid].x;
}
}
""").build()
mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg.mandelbrot(queue, output.shape, None, q_opencl,
output_opencl, np.uint16(maxiter))
cl.enqueue_copy(queue, output, output_opencl).wait()
return output
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
This is way faster than the rest
Let's check it is correct
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
C:\Users\IBM_ADMIN\Anaconda3\lib\site-packages\pyopencl\__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more. "to see more.", CompilerWarning)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
C:\Users\IBM_ADMIN\Anaconda3\lib\site-packages\pyopencl\__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more. "to see more.", CompilerWarning)
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
C:\Users\IBM_ADMIN\Anaconda3\lib\site-packages\pyopencl\__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more. "to see more.", CompilerWarning) C:\Users\IBM_ADMIN\Anaconda3\lib\site-packages\pyopencl\__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more. "to see more.", CompilerWarning)
10 loops, best of 3: 20.6 ms per loop 10 loops, best of 3: 173 ms per loop
Modified from http://wiki.tiker.net/PyCuda/Examples/Mandelbrot
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
complex_gpu1 = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float nreal, real = 0;
float imag = 0;
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[i].real();
imag = 2* real*imag + q[i].imag();
real = nreal;
if (real*real + imag*imag > 4.0f){
output[i] = curiter;
break;
};
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
complex_gpu = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float nreal, real = 0;
float imag = 0;
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real;
float imag2 = imag*imag;
nreal = real2 - imag2 + q[i].real();
imag = 2* real*imag + q[i].imag();
real = nreal;
if (real2 + imag2 > 4.0f){
output[i] = curiter;
break;
};
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
complex_gpu1 = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float real = q[i].real();
float imag = q[i].imag();
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real;
float imag2 = imag*imag;
if (real2 + imag2 > 4.0f){
output[i] = curiter;
break;
};
imag = 2* real*imag + q[i].imag();
real = real2 - imag2 + q[i].real();
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
def mandelbrot_gpu(c, maxiter):
q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.int))
complex_gpu(q_gpu, iterations_gpu, maxiter)
return iterations_gpu.get()
Let's check it is correct
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
C:\Users\IBM_ADMIN\Anaconda3\lib\site-packages\pyopencl\__init__.py:206: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more. "to see more.", CompilerWarning)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
10 loops, best of 3: 31.9 ms per loop
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
1 loops, best of 3: 193 ms per loop
Total running time isn't the only way to look at results. Time per iteration is also interesting. We have two flavors of code, those who loop first on pixels, then on iteration of complex computation, and those who loop first on the complex iteration using array operations. The average number of iteration is way lower in the former case. Let's compute it.
We must be careful though, as we set to 0 the pixels that did not escape horizon. let's use a code variant that does not do this.
from numba import jit
@jit
def mandelbrot(creal,cimag,maxiter):
real = creal
imag = cimag
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal
return maxiter
@jit
def mandelbrot_set4(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i],r2[j],maxiter)
return (r1,r2,n3)
(r1,r2,n3) = mandelbrot_set4(-2.0,0.5,-1.25,1.25,1000,1000,80)
print(n3.mean())
(r1,r2,n3) = mandelbrot_set4(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
print(n3.mean())
23.46311 582.160638