A revisit of the Python micro benchmarks available at https://github.com/JuliaLang/julia/blob/master/test/perf/micro/perf.py
The code used in this notebook is discussed at length in Python Meets Julia Micro Performance
Timings below are for Anaconda 64 bits with Python 3.5.1 and Numba 0.23.1 running on a Windows laptop (Lenovo W520). Timings on another machine with another Python version can be quite different.
First, let's load useful extensions and import useful modules
%load_ext Cython
%load_ext line_profiler
import random
import numpy as np
from numba import jit
import sys
if sys.version_info < (3,):
range = xrange
The base line is gthe one used in Julia benchmarks. It is a straightforward, frecursive implementation.
def fib(n):
if n<2:
return n
return fib(n-1)+fib(n-2)
Sanity check.
assert fib(20) == 6765
Let's try various versions.
%%cython
def fib_cython(n):
if n<2:
return n
return fib_cython(n-1)+fib_cython(n-2)
cpdef inline fib_cython_inline(n):
if n<2:
return n
return fib_cython_inline(n-1)+fib_cython_inline(n-2)
cpdef long fib_cython_type(long n):
if n<2:
return n
return fib_cython_type(n-1)+fib_cython_type(n-2)
cpdef long fib_cython_type_inline(long n):
if n<2:
return n
return fib_cython_type_inline(n-1)+fib_cython_type_inline(n-2)
Sanity checks
assert fib_cython(20) == 6765
assert fib_cython_inline(20) == 6765
assert fib_cython_type(20) == 6765
assert fib_cython_type_inline(20) == 6765
Let's try three versions, with and without type.
from numba import int32, int64
@jit
def fib_numba(n):
if n<2:
return n
return fib_numba(n-1)+fib_numba(n-2)
@jit(int32(int32))
def fib_numba_type32(n):
if n<2:
return n
return fib_numba_type32(n-1)+fib_numba_type32(n-2)
@jit(int64(int64))
def fib_numba_type64(n):
if n<2:
return n
return fib_numba_type64(n-1)+fib_numba_type64(n-2)
Sanity check
assert fib_numba(20) == 6765
assert fib_numba_type32(20) == 6765
assert fib_numba_type64(20) == 6765
def ffib(n):
if n<2.0:
return n
return ffib(n-1)+ffib(n-2)
%timeit fib(20)
%timeit fib_cython(20)
%timeit fib_cython_inline(20)
%timeit fib_cython_type(20)
%timeit fib_cython_type_inline(20)
%timeit fib_numba(20)
%timeit fib_numba_type32(20)
%timeit fib_numba_type64(20)
%timeit ffib(20.0)
100 loops, best of 3: 3.77 ms per loop 1000 loops, best of 3: 1.47 ms per loop 1000 loops, best of 3: 759 µs per loop 10000 loops, best of 3: 24.4 µs per loop 10000 loops, best of 3: 24.6 µs per loop 100 loops, best of 3: 4.89 ms per loop 100 loops, best of 3: 5 ms per loop 100 loops, best of 3: 5.01 ms per loop 100 loops, best of 3: 5.26 ms per loop
Let's try a larger one.
n = 30
%timeit fib(n)
%timeit fib_cython(n)
%timeit fib_cython_inline(n)
%timeit fib_cython_type(n)
%timeit fib_cython_type_inline(n)
%timeit fib_numba(n)
%timeit ffib(n*1.0)
1 loops, best of 3: 442 ms per loop 10 loops, best of 3: 178 ms per loop 10 loops, best of 3: 94.2 ms per loop 100 loops, best of 3: 3.05 ms per loop 100 loops, best of 3: 3.01 ms per loop 1 loops, best of 3: 600 ms per loop 1 loops, best of 3: 648 ms per loop
The recursive call is not the best way to compute Fibonacci numbers. It is better to accumulate them as follows.
from functools import lru_cache as cache
@cache(maxsize=None)
def fib_cache(n):
if n<2:
return n
return fib_cache(n-1)+fib_cache(n-2)
def fib_seq(n):
if n < 2:
return n
a,b = 1,0
for i in range(n-1):
a,b = a+b,a
return a
@jit
def fib_seq_numba(n):
if n < 2:
return n
a,b = 1,0
for i in range(n-1):
a,b = a+b,a
return a
%%cython
def fib_seq_cython(n):
if n < 2:
return n
a,b = 1,0
for i in range(n-1):
a,b = a+b,a
return a
cpdef long fib_seq_cython_type(long n):
if n < 2:
return n
cdef long a,b
a,b = 1,0
for i in range(n-1):
a,b = a+b,a
return a
assert fib_seq(20) == 6765
assert fib_seq_cython(20) == 6765
assert fib_seq_cython_type(20) == 6765
assert fib_seq_numba(20) == 6765
%timeit fib_cache(20)
%timeit fib_seq(20)
%timeit fib_seq_cython(20)
%timeit fib_seq_cython_type(20)
%timeit fib_seq_numba(20)
The slowest run took 188.04 times longer than the fastest. This could mean that an intermediate result is being cached 10000000 loops, best of 3: 127 ns per loop 1000000 loops, best of 3: 1.81 µs per loop The slowest run took 4.46 times longer than the fastest. This could mean that an intermediate result is being cached 1000000 loops, best of 3: 959 ns per loop The slowest run took 15.66 times longer than the fastest. This could mean that an intermediate result is being cached 10000000 loops, best of 3: 82 ns per loop The slowest run took 31.77 times longer than the fastest. This could mean that an intermediate result is being cached 1000000 loops, best of 3: 216 ns per loop
fib_seq(100)
354224848179261915075
Code used in Julia benchmarks
def qsort_kernel(a, lo, hi):
i = lo
j = hi
while i < hi:
pivot = a[(lo+hi) // 2]
while i <= j:
while a[i] < pivot:
i += 1
while a[j] > pivot:
j -= 1
if i <= j:
a[i], a[j] = a[j], a[i]
i += 1
j -= 1
if lo < j:
qsort_kernel(a, lo, j)
lo = i
j = hi
return a
def benchmark_qsort():
lst = [ random.random() for i in range(1,5000) ]
qsort_kernel(lst, 0, len(lst)-1)
@jit
def qsort_kernel_numba(a, lo, hi):
i = lo
j = hi
while i < hi:
pivot = a[(lo+hi) // 2]
while i <= j:
while a[i] < pivot:
i += 1
while a[j] > pivot:
j -= 1
if i <= j:
a[i], a[j] = a[j], a[i]
i += 1
j -= 1
if lo < j:
qsort_kernel_numba(a, lo, j)
lo = i
j = hi
return a
def benchmark_qsort_numba():
lst = [ random.random() for i in range(1,5000) ]
qsort_kernel_numba(lst, 0, len(lst)-1)
With Numpy
@jit
def qsort_kernel_numba_numpy(a, lo, hi):
i = lo
j = hi
while i < hi:
pivot = a[(lo+hi) // 2]
while i <= j:
while a[i] < pivot:
i += 1
while a[j] > pivot:
j -= 1
if i <= j:
a[i], a[j] = a[j], a[i]
i += 1
j -= 1
if lo < j:
qsort_kernel_numba_numpy(a, lo, j)
lo = i
j = hi
return a
def benchmark_qsort_numba_numpy():
lst = np.random.rand(5000)
qsort_kernel_numba(lst, 0, len(lst)-1)
%%cython
import numpy as np
import cython
@cython.boundscheck(False)
@cython.wraparound(False)
cdef double[:] \
qsort_kernel_cython_numpy_type(double[:] a, \
long lo, \
long hi):
cdef:
long i, j
double pivot
i = lo
j = hi
while i < hi:
pivot = a[(lo+hi) // 2]
while i <= j:
while a[i] < pivot:
i += 1
while a[j] > pivot:
j -= 1
if i <= j:
a[i], a[j] = a[j], a[i]
i += 1
j -= 1
if lo < j:
qsort_kernel_cython_numpy_type(a, lo, j)
lo = i
j = hi
return a
def benchmark_qsort_numpy_cython():
lst = np.random.rand(5000)
qsort_kernel_cython_numpy_type(lst, 0, len(lst)-1)
%timeit benchmark_qsort()
%timeit benchmark_qsort_numba()
%timeit benchmark_qsort_numba_numpy()
%timeit benchmark_qsort_numpy_cython()
10 loops, best of 3: 17.7 ms per loop The slowest run took 4.06 times longer than the fastest. This could mean that an intermediate result is being cached 1 loops, best of 3: 79.4 ms per loop The slowest run took 12.62 times longer than the fastest. This could mean that an intermediate result is being cached 1 loops, best of 3: 20.9 ms per loop 1000 loops, best of 3: 772 µs per loop
def benchmark_sort():
lst = [ random.random() for i in range(1,5000) ]
lst.sort()
def benchmark_sort_numpy():
lst = np.random.rand(5000)
np.sort(lst)
%timeit benchmark_sort()
%timeit benchmark_sort_numpy()
1000 loops, best of 3: 2 ms per loop 1000 loops, best of 3: 306 µs per loop
Used in Julia benchmarks
def randmatstat(t):
n = 5
v = np.zeros(t)
w = np.zeros(t)
for i in range(1,t):
a = np.random.randn(n, n)
b = np.random.randn(n, n)
c = np.random.randn(n, n)
d = np.random.randn(n, n)
P = np.matrix(np.hstack((a, b, c, d)))
Q = np.matrix(np.vstack((np.hstack((a, b)), np.hstack((c, d)))))
v[i] = np.trace(np.linalg.matrix_power(np.transpose(P)*P, 4))
w[i] = np.trace(np.linalg.matrix_power(np.transpose(Q)*Q, 4))
return (np.std(v)/np.mean(v), np.std(w)/np.mean(w))
def my_power(a):
a = np.dot(a.T,a)
a = np.dot(a,a)
a = np.dot(a,a)
return a
def randmatstat_fast(t):
n = 5
v = np.zeros(t)
w = np.zeros(t)
for i in range(1,t):
a = np.random.randn(n, n)
b = np.random.randn(n, n)
c = np.random.randn(n, n)
d = np.random.randn(n, n)
P = np.hstack((a, b, c, d))
Q = np.vstack((np.hstack((a, b)), np.hstack((c, d))))
v[i] = np.trace(my_power(P))
w[i] = np.trace(my_power(Q))
return (np.std(v)/np.mean(v), np.std(w)/np.mean(w))
%timeit randmatstat(1000)
%timeit randmatstat_fast(1000)
10 loops, best of 3: 160 ms per loop 10 loops, best of 3: 83.2 ms per loop
def randmatmul(n):
A = np.random.rand(n,n)
B = np.random.rand(n,n)
return np.dot(A,B)
%timeit randmatmul(1000)
10 loops, best of 3: 83.7 ms per loop
The baseline used in Julia benchmarks
def mandel(z):
maxiter = 80
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter
def mandelperf():
r1 = np.linspace(-2.0, 0.5, 26)
r2 = np.linspace(-1.0, 1.0, 21)
return [mandel(complex(r, i)) for r in r1 for i in r2]
assert sum(mandelperf()) == 14791
@jit
def mandel_numba(z):
maxiter = 80
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter
def mandelperf_numba():
r1 = np.linspace(-2.0, 0.5, 26)
r2 = np.linspace(-1.0, 1.0, 21)
c3 = [complex(r,i) for r in r1 for i in r2]
return [mandel_numba(c) for c in c3]
assert sum(mandelperf_numba()) == 14791
%%cython
import numpy as np
cimport numpy as np
cdef long mandel_cython_type(np.complex z):
cdef long maxiter, n
cdef np.complex c
maxiter = 80
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter
cpdef mandelperf_cython_type():
cdef double[:] r1,r2
r1 = np.linspace(-2.0, 0.5, 26)
r2 = np.linspace(-1.0, 1.0, 21)
return [mandel_cython_type(complex(r, i)) for r in r1 for i in r2]
assert sum(mandelperf_cython_type()) == 14791
%timeit mandelperf()
%timeit mandelperf_cython_type()
%timeit mandelperf_numba()
100 loops, best of 3: 6.57 ms per loop 100 loops, best of 3: 3.6 ms per loop 1000 loops, best of 3: 481 µs per loop
%lprun -s -f mandelperf_numba mandelperf_numba()
@jit
def mandel_numba(z):
maxiter = 80
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter
@jit
def mandelperf_numba_mesh():
width = 26
height = 21
r1 = np.linspace(-2.0, 0.5, width)
r2 = np.linspace(-1.0, 1.0, height)
mandel_set = np.empty((width,height), dtype=int)
for i in range(width):
for j in range(height):
mandel_set[i,j] = mandel_numba(r1[i] + 1j*r2[j])
return mandel_set
assert np.sum(mandelperf_numba_mesh()) == 14791
%timeit mandelperf_numba_mesh()
10000 loops, best of 3: 126 µs per loop
An even faster code is presented in How To Quickly Compute The Mandelbrot Set In Python
def pisum(t):
sum = 0.0
for j in range(1, 501):
sum = 0.0
for k in range(1, t+1):
sum += 1.0/(k*k)
return sum
@jit
def pisum_numba(t):
sum = 0.0
for j in range(1, 501):
sum = 0.0
for k in range(1, t+1):
sum += 1.0/(k*k)
return sum
%%cython
import cython
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef double pisum_cython(int t):
cdef:
double sum = 0.0
int j,k
for j in range(1, 501):
sum = 0.0
for k in range(1, t+1):
sum += 1.0/(k*k)
return sum
assert abs(pisum(10000)-1.644834071848065) < 1e-6
assert abs(pisum_numba(10000)-1.644834071848065) < 1e-6
assert abs(pisum_cython(10000)-1.644834071848065) < 1e-6
%timeit pisum(10000)
%timeit pisum_numba(10000)
%timeit pisum_cython(10000)
1 loops, best of 3: 926 ms per loop 100 loops, best of 3: 20.4 ms per loop 10 loops, best of 3: 38.2 ms per loop
def parse_int():
for i in range(1,1000):
n = random.randint(0,2**32-1)
s = hex(n)
# required for Python 2.7 on 64 bits machine
if s[-1]=='L':
s = s[0:-1]
m = int(s,16)
assert m == n
def parse_int_numpy():
for i in range(1,1000):
n = np.random.randint(0,2**31-1)
s = hex(n)
# required for Python 2.7 on 64 bits machine
if s[-1]=='L':
s = s[0:-1]
m = int(s,16)
assert m == n
def parse_int_opt():
for i in range(1,1000):
n = random.randint(0,2**32-1)
s = hex(n)
# required for Python 2.7 on 64 bits machine
if s.endswith('L'):
s = s[0:-1]
m = int(s,16)
assert m == n
def parse_int1():
for i in range(1,1000):
n = random.randint(0,2**32-1)
s = hex(n)
m = int(s,16)
assert m == n
def parse_int1_numpy():
for i in range(1,1000):
n = np.random.randint(0,2**31-1)
s = hex(n)
m = int(s,16)
assert m == n
@jit
def parse_numba():
for i in range(1,1000):
n = random.randint(0,2**32-1)
s = hex(n)
m = int(s,16)
assert m == n
@jit
def parse_numba_numpy():
for i in range(1,1000):
n = np.random.randint(0,2**31-1)
s = hex(n)
m = int(s,16)
assert m == n
Int in C are up to 2^31-1, hence we must use this as the limit when using Cython or Numpy
%%cython
import random
import cython
cimport cython
import numpy as np
cimport numpy as np
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef parse_int_cython():
cdef:
int i,n,m
for i in range(1,1000):
n = random.randint(0,2**31-1)
m = int(hex(n),16)
assert m == n
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef parse_int_cython_numpy():
cdef:
int i,n,m
for i in range(1,1000):
n = np.random.randint(0,2**31-1)
m = int(hex(n),16)
assert m == n
%timeit parse_int()
%timeit parse_int_numpy()
%timeit parse_int_opt()
%timeit parse_int1()
%timeit parse_int1_numpy()
%timeit parse_numba()
%timeit parse_numba_numpy()
%timeit parse_int_cython()
%timeit parse_int_cython_numpy()
100 loops, best of 3: 3.55 ms per loop 1000 loops, best of 3: 1.21 ms per loop 100 loops, best of 3: 3.72 ms per loop 100 loops, best of 3: 3.32 ms per loop 1000 loops, best of 3: 1.05 ms per loop 100 loops, best of 3: 3.63 ms per loop 1000 loops, best of 3: 1.13 ms per loop 100 loops, best of 3: 3.07 ms per loop 1000 loops, best of 3: 817 µs per loop
%lprun -s -f parse_int parse_int()
import numpy as np
def parse_int_vec():
n = np.random.randint(2^31-1,size=1000)
for i in range(1,1000):
ni = n[i]
s = hex(ni)
m = int(s,16)
assert m == ni
@jit
def parse_int_vec_numba():
n = np.random.randint(2^31-1,size=1000)
for i in range(1,1000):
ni = n[i]
s = hex(ni)
m = int(s,16)
assert m == ni
vhex = np.vectorize(hex)
vint = np.vectorize(int)
def parse_int_numpy():
n = np.random.randint(0,2**31-1,1000)
s = vhex(n)
m = vint(s,16)
np.all(m == n)
return s
@jit
def parse_int_numpy_numba():
n = np.random.randint(0,2**31-1,1000)
s = vhex(n)
m = vint(s,16)
np.all(m == n)
%%cython
import numpy as np
import cython
cimport numpy
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef parse_int_vec_cython():
cdef:
int i,m
int[:] n
n = np.random.randint(0,2**31-1,1000)
for i in range(1,1000):
m = int(hex(n[i]),16)
assert m == n[i]
%timeit parse_int_vec()
%timeit parse_int_vec_numba()
%timeit parse_int_numpy()
%timeit parse_int_numpy_numba()
%timeit parse_int_vec_cython()
1000 loops, best of 3: 781 µs per loop The slowest run took 195.93 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 824 µs per loop 1000 loops, best of 3: 733 µs per loop The slowest run took 104.18 times longer than the fastest. This could mean that an intermediate result is being cached 1000 loops, best of 3: 739 µs per loop 1000 loops, best of 3: 471 µs per loop
That's it!