#!/usr/bin/env python # coding: utf-8 # # A Python Performance Optimization Exercise # # ## Author [Jean-François Puget](https://www.ibm.com/developerworks/community/blogs/jfp/?lang=en) # # A revisit of the Python micro benchmarks available at https://github.com/JuliaLang/julia/blob/master/test/perf/micro/perf.py # # The code used in this notebook is discussed at length in [Python Meets Julia Micro Performance](https://www.ibm.com/developerworks/community/blogs/jfp/entry/python_meets_julia_micro_performance) # # Timings below are for Anaconda 64 bits with Python 3.5.1 and Numba 0.23.1 running on a Windows laptop (Lenovo W520). Timings on another machine with another Python version can be quite different. # First, let's load useful extensions and import useful modules # In[1]: get_ipython().run_line_magic('load_ext', 'Cython') get_ipython().run_line_magic('load_ext', 'line_profiler') import random import numpy as np from numba import jit import sys if sys.version_info < (3,): range = xrange # # Fibonacci # ### Naive code # # The base line is gthe one used in Julia benchmarks. It is a straightforward, frecursive implementation. # In[2]: def fib(n): if n<2: return n return fib(n-1)+fib(n-2) # Sanity check. # In[3]: assert fib(20) == 6765 # ### Cython # # Let's try various versions. # In[4]: get_ipython().run_cell_magic('cython', '', 'def fib_cython(n):\n if n<2:\n return n\n return fib_cython(n-1)+fib_cython(n-2)\n\ncpdef inline fib_cython_inline(n):\n if n<2:\n return n\n return fib_cython_inline(n-1)+fib_cython_inline(n-2)\n\n\ncpdef long fib_cython_type(long n):\n if n<2:\n return n\n return fib_cython_type(n-1)+fib_cython_type(n-2)\n\ncpdef long fib_cython_type_inline(long n):\n if n<2:\n return n\n return fib_cython_type_inline(n-1)+fib_cython_type_inline(n-2)\n') # Sanity checks # In[5]: assert fib_cython(20) == 6765 assert fib_cython_inline(20) == 6765 assert fib_cython_type(20) == 6765 assert fib_cython_type_inline(20) == 6765 # ### Numba # # Let's try three versions, with and without type. # In[6]: from numba import int32, int64 @jit def fib_numba(n): if n<2: return n return fib_numba(n-1)+fib_numba(n-2) @jit(int32(int32)) def fib_numba_type32(n): if n<2: return n return fib_numba_type32(n-1)+fib_numba_type32(n-2) @jit(int64(int64)) def fib_numba_type64(n): if n<2: return n return fib_numba_type64(n-1)+fib_numba_type64(n-2) # Sanity check # In[7]: assert fib_numba(20) == 6765 assert fib_numba_type32(20) == 6765 assert fib_numba_type64(20) == 6765 # ### Using floats # In[8]: def ffib(n): if n<2.0: return n return ffib(n-1)+ffib(n-2) # ### Timings # In[9]: get_ipython().run_line_magic('timeit', 'fib(20)') get_ipython().run_line_magic('timeit', 'fib_cython(20)') get_ipython().run_line_magic('timeit', 'fib_cython_inline(20)') get_ipython().run_line_magic('timeit', 'fib_cython_type(20)') get_ipython().run_line_magic('timeit', 'fib_cython_type_inline(20)') get_ipython().run_line_magic('timeit', 'fib_numba(20)') get_ipython().run_line_magic('timeit', 'fib_numba_type32(20)') get_ipython().run_line_magic('timeit', 'fib_numba_type64(20)') get_ipython().run_line_magic('timeit', 'ffib(20.0)') # Let's try a larger one. # In[10]: n = 30 get_ipython().run_line_magic('timeit', 'fib(n)') get_ipython().run_line_magic('timeit', 'fib_cython(n)') get_ipython().run_line_magic('timeit', 'fib_cython_inline(n)') get_ipython().run_line_magic('timeit', 'fib_cython_type(n)') get_ipython().run_line_magic('timeit', 'fib_cython_type_inline(n)') get_ipython().run_line_magic('timeit', 'fib_numba(n)') get_ipython().run_line_magic('timeit', 'ffib(n*1.0)') # ## Sequential Fibonacci # # The recursive call is not the best way to compute Fibonacci numbers. It is better to accumulate them as follows. # ### Functools # In[51]: from functools import lru_cache as cache @cache(maxsize=None) def fib_cache(n): if n<2: return n return fib_cache(n-1)+fib_cache(n-2) # ### Plain Python # In[11]: def fib_seq(n): if n < 2: return n a,b = 1,0 for i in range(n-1): a,b = a+b,a return a # ### Numba # In[12]: @jit def fib_seq_numba(n): if n < 2: return n a,b = 1,0 for i in range(n-1): a,b = a+b,a return a # ### Cython # In[13]: get_ipython().run_cell_magic('cython', '', '\ndef fib_seq_cython(n):\n if n < 2:\n return n\n a,b = 1,0\n for i in range(n-1):\n a,b = a+b,a\n return a \n\ncpdef long fib_seq_cython_type(long n):\n if n < 2:\n return n\n cdef long a,b\n a,b = 1,0\n for i in range(n-1):\n a,b = a+b,a\n return a \n') # ### Sanity checks # In[14]: assert fib_seq(20) == 6765 assert fib_seq_cython(20) == 6765 assert fib_seq_cython_type(20) == 6765 assert fib_seq_numba(20) == 6765 # ### Timings # In[52]: get_ipython().run_line_magic('timeit', 'fib_cache(20)') get_ipython().run_line_magic('timeit', 'fib_seq(20)') get_ipython().run_line_magic('timeit', 'fib_seq_cython(20)') get_ipython().run_line_magic('timeit', 'fib_seq_cython_type(20)') get_ipython().run_line_magic('timeit', 'fib_seq_numba(20)') # ### Arbitrary precision check # In[16]: fib_seq(100) # # Quicksort # ### Pure Python # # Code used in Julia benchmarks # In[17]: def qsort_kernel(a, lo, hi): i = lo j = hi while i < hi: pivot = a[(lo+hi) // 2] while i <= j: while a[i] < pivot: i += 1 while a[j] > pivot: j -= 1 if i <= j: a[i], a[j] = a[j], a[i] i += 1 j -= 1 if lo < j: qsort_kernel(a, lo, j) lo = i j = hi return a def benchmark_qsort(): lst = [ random.random() for i in range(1,5000) ] qsort_kernel(lst, 0, len(lst)-1) # ### Numba # In[18]: @jit def qsort_kernel_numba(a, lo, hi): i = lo j = hi while i < hi: pivot = a[(lo+hi) // 2] while i <= j: while a[i] < pivot: i += 1 while a[j] > pivot: j -= 1 if i <= j: a[i], a[j] = a[j], a[i] i += 1 j -= 1 if lo < j: qsort_kernel_numba(a, lo, j) lo = i j = hi return a def benchmark_qsort_numba(): lst = [ random.random() for i in range(1,5000) ] qsort_kernel_numba(lst, 0, len(lst)-1) # With Numpy # In[19]: @jit def qsort_kernel_numba_numpy(a, lo, hi): i = lo j = hi while i < hi: pivot = a[(lo+hi) // 2] while i <= j: while a[i] < pivot: i += 1 while a[j] > pivot: j -= 1 if i <= j: a[i], a[j] = a[j], a[i] i += 1 j -= 1 if lo < j: qsort_kernel_numba_numpy(a, lo, j) lo = i j = hi return a def benchmark_qsort_numba_numpy(): lst = np.random.rand(5000) qsort_kernel_numba(lst, 0, len(lst)-1) # ### Cython # In[20]: get_ipython().run_cell_magic('cython', '', 'import numpy as np\nimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncdef double[:] \\\nqsort_kernel_cython_numpy_type(double[:] a, \\\n long lo, \\\n long hi):\n cdef: \n long i, j\n double pivot\n i = lo\n j = hi\n while i < hi:\n pivot = a[(lo+hi) // 2]\n while i <= j:\n while a[i] < pivot:\n i += 1\n while a[j] > pivot:\n j -= 1\n if i <= j:\n a[i], a[j] = a[j], a[i]\n i += 1\n j -= 1\n if lo < j:\n qsort_kernel_cython_numpy_type(a, lo, j)\n lo = i\n j = hi\n return a\n\ndef benchmark_qsort_numpy_cython():\n lst = np.random.rand(5000)\n qsort_kernel_cython_numpy_type(lst, 0, len(lst)-1)\n \n') # In[21]: get_ipython().run_line_magic('timeit', 'benchmark_qsort()') get_ipython().run_line_magic('timeit', 'benchmark_qsort_numba()') get_ipython().run_line_magic('timeit', 'benchmark_qsort_numba_numpy()') get_ipython().run_line_magic('timeit', 'benchmark_qsort_numpy_cython()') # ### Built-in sort # In[22]: def benchmark_sort(): lst = [ random.random() for i in range(1,5000) ] lst.sort() def benchmark_sort_numpy(): lst = np.random.rand(5000) np.sort(lst) # In[23]: get_ipython().run_line_magic('timeit', 'benchmark_sort()') get_ipython().run_line_magic('timeit', 'benchmark_sort_numpy()') # # Random mat stat # ### Baseline # # Used in Julia benchmarks # In[24]: def randmatstat(t): n = 5 v = np.zeros(t) w = np.zeros(t) for i in range(1,t): a = np.random.randn(n, n) b = np.random.randn(n, n) c = np.random.randn(n, n) d = np.random.randn(n, n) P = np.matrix(np.hstack((a, b, c, d))) Q = np.matrix(np.vstack((np.hstack((a, b)), np.hstack((c, d))))) v[i] = np.trace(np.linalg.matrix_power(np.transpose(P)*P, 4)) w[i] = np.trace(np.linalg.matrix_power(np.transpose(Q)*Q, 4)) return (np.std(v)/np.mean(v), np.std(w)/np.mean(w)) # ### Speeding it up # In[25]: def my_power(a): a = np.dot(a.T,a) a = np.dot(a,a) a = np.dot(a,a) return a def randmatstat_fast(t): n = 5 v = np.zeros(t) w = np.zeros(t) for i in range(1,t): a = np.random.randn(n, n) b = np.random.randn(n, n) c = np.random.randn(n, n) d = np.random.randn(n, n) P = np.hstack((a, b, c, d)) Q = np.vstack((np.hstack((a, b)), np.hstack((c, d)))) v[i] = np.trace(my_power(P)) w[i] = np.trace(my_power(Q)) return (np.std(v)/np.mean(v), np.std(w)/np.mean(w)) # ### Timings # In[26]: get_ipython().run_line_magic('timeit', 'randmatstat(1000)') get_ipython().run_line_magic('timeit', 'randmatstat_fast(1000)') # # Randmatmul # In[27]: def randmatmul(n): A = np.random.rand(n,n) B = np.random.rand(n,n) return np.dot(A,B) # In[28]: get_ipython().run_line_magic('timeit', 'randmatmul(1000)') # # Mandelbrot # ### Baseline # The baseline used in Julia benchmarks # In[29]: def mandel(z): maxiter = 80 c = z for n in range(maxiter): if abs(z) > 2: return n z = z*z + c return maxiter def mandelperf(): r1 = np.linspace(-2.0, 0.5, 26) r2 = np.linspace(-1.0, 1.0, 21) return [mandel(complex(r, i)) for r in r1 for i in r2] # In[30]: assert sum(mandelperf()) == 14791 # ### Numba # In[31]: @jit def mandel_numba(z): maxiter = 80 c = z for n in range(maxiter): if abs(z) > 2: return n z = z*z + c return maxiter def mandelperf_numba(): r1 = np.linspace(-2.0, 0.5, 26) r2 = np.linspace(-1.0, 1.0, 21) c3 = [complex(r,i) for r in r1 for i in r2] return [mandel_numba(c) for c in c3] # In[32]: assert sum(mandelperf_numba()) == 14791 # ### Cython # In[33]: get_ipython().run_cell_magic('cython', '', '\nimport numpy as np\ncimport numpy as np\n\ncdef long mandel_cython_type(np.complex z):\n cdef long maxiter, n\n cdef np.complex c\n maxiter = 80\n c = z\n for n in range(maxiter):\n if abs(z) > 2:\n return n\n z = z*z + c\n return maxiter\n\ncpdef mandelperf_cython_type():\n cdef double[:] r1,r2\n r1 = np.linspace(-2.0, 0.5, 26)\n r2 = np.linspace(-1.0, 1.0, 21)\n return [mandel_cython_type(complex(r, i)) for r in r1 for i in r2]\n') # In[34]: assert sum(mandelperf_cython_type()) == 14791 # In[35]: get_ipython().run_line_magic('timeit', 'mandelperf()') get_ipython().run_line_magic('timeit', 'mandelperf_cython_type()') get_ipython().run_line_magic('timeit', 'mandelperf_numba()') # ### Profiling # In[36]: get_ipython().run_line_magic('lprun', '-s -f mandelperf_numba mandelperf_numba()') # ### Numpy # In[53]: @jit def mandel_numba(z): maxiter = 80 c = z for n in range(maxiter): if abs(z) > 2: return n z = z*z + c return maxiter @jit def mandelperf_numba_mesh(): width = 26 height = 21 r1 = np.linspace(-2.0, 0.5, width) r2 = np.linspace(-1.0, 1.0, height) mandel_set = np.empty((width,height), dtype=int) for i in range(width): for j in range(height): mandel_set[i,j] = mandel_numba(r1[i] + 1j*r2[j]) return mandel_set # In[54]: assert np.sum(mandelperf_numba_mesh()) == 14791 # In[55]: get_ipython().run_line_magic('timeit', 'mandelperf_numba_mesh()') # An even faster code is presented in [How To Quickly Compute The Mandelbrot Set In Python](https://www.ibm.com/developerworks/community/blogs/jfp/entry/How_To_Compute_Mandelbrodt_Set_Quickly?lang=en) # # Pisum # In[43]: def pisum(t): sum = 0.0 for j in range(1, 501): sum = 0.0 for k in range(1, t+1): sum += 1.0/(k*k) return sum @jit def pisum_numba(t): sum = 0.0 for j in range(1, 501): sum = 0.0 for k in range(1, t+1): sum += 1.0/(k*k) return sum # In[44]: get_ipython().run_cell_magic('cython', '', '\nimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef double pisum_cython(int t):\n cdef:\n double sum = 0.0\n int j,k\n for j in range(1, 501):\n sum = 0.0\n for k in range(1, t+1):\n sum += 1.0/(k*k)\n return sum\n') # In[45]: assert abs(pisum(10000)-1.644834071848065) < 1e-6 assert abs(pisum_numba(10000)-1.644834071848065) < 1e-6 assert abs(pisum_cython(10000)-1.644834071848065) < 1e-6 # In[46]: get_ipython().run_line_magic('timeit', 'pisum(10000)') get_ipython().run_line_magic('timeit', 'pisum_numba(10000)') get_ipython().run_line_magic('timeit', 'pisum_cython(10000)') # ## Parse int # In[19]: def parse_int(): for i in range(1,1000): n = random.randint(0,2**32-1) s = hex(n) # required for Python 2.7 on 64 bits machine if s[-1]=='L': s = s[0:-1] m = int(s,16) assert m == n def parse_int_numpy(): for i in range(1,1000): n = np.random.randint(0,2**31-1) s = hex(n) # required for Python 2.7 on 64 bits machine if s[-1]=='L': s = s[0:-1] m = int(s,16) assert m == n def parse_int_opt(): for i in range(1,1000): n = random.randint(0,2**32-1) s = hex(n) # required for Python 2.7 on 64 bits machine if s.endswith('L'): s = s[0:-1] m = int(s,16) assert m == n def parse_int1(): for i in range(1,1000): n = random.randint(0,2**32-1) s = hex(n) m = int(s,16) assert m == n def parse_int1_numpy(): for i in range(1,1000): n = np.random.randint(0,2**31-1) s = hex(n) m = int(s,16) assert m == n @jit def parse_numba(): for i in range(1,1000): n = random.randint(0,2**32-1) s = hex(n) m = int(s,16) assert m == n @jit def parse_numba_numpy(): for i in range(1,1000): n = np.random.randint(0,2**31-1) s = hex(n) m = int(s,16) assert m == n # Int in C are up to 2^31-1, hence we must use this as the limit when using Cython or Numpy # In[21]: get_ipython().run_cell_magic('cython', '', 'import random\nimport cython\ncimport cython\nimport numpy as np\ncimport numpy as np\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_cython():\n cdef:\n int i,n,m\n for i in range(1,1000):\n n = random.randint(0,2**31-1)\n m = int(hex(n),16)\n assert m == n\n \n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_cython_numpy():\n cdef:\n int i,n,m\n for i in range(1,1000):\n n = np.random.randint(0,2**31-1)\n m = int(hex(n),16)\n assert m == n\n') # In[22]: get_ipython().run_line_magic('timeit', 'parse_int()') get_ipython().run_line_magic('timeit', 'parse_int_numpy()') get_ipython().run_line_magic('timeit', 'parse_int_opt()') get_ipython().run_line_magic('timeit', 'parse_int1()') get_ipython().run_line_magic('timeit', 'parse_int1_numpy()') get_ipython().run_line_magic('timeit', 'parse_numba()') get_ipython().run_line_magic('timeit', 'parse_numba_numpy()') get_ipython().run_line_magic('timeit', 'parse_int_cython()') get_ipython().run_line_magic('timeit', 'parse_int_cython_numpy()') # ### Vectorized operation # In[61]: get_ipython().run_line_magic('lprun', '-s -f parse_int parse_int()') # In[25]: import numpy as np def parse_int_vec(): n = np.random.randint(2^31-1,size=1000) for i in range(1,1000): ni = n[i] s = hex(ni) m = int(s,16) assert m == ni @jit def parse_int_vec_numba(): n = np.random.randint(2^31-1,size=1000) for i in range(1,1000): ni = n[i] s = hex(ni) m = int(s,16) assert m == ni # In[26]: vhex = np.vectorize(hex) vint = np.vectorize(int) def parse_int_numpy(): n = np.random.randint(0,2**31-1,1000) s = vhex(n) m = vint(s,16) np.all(m == n) return s @jit def parse_int_numpy_numba(): n = np.random.randint(0,2**31-1,1000) s = vhex(n) m = vint(s,16) np.all(m == n) # In[27]: get_ipython().run_cell_magic('cython', '', 'import numpy as np\nimport cython\ncimport numpy\ncimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_vec_cython():\n cdef:\n int i,m\n int[:] n\n n = np.random.randint(0,2**31-1,1000)\n for i in range(1,1000):\n m = int(hex(n[i]),16)\n assert m == n[i]\n') # In[28]: get_ipython().run_line_magic('timeit', 'parse_int_vec()') get_ipython().run_line_magic('timeit', 'parse_int_vec_numba()') get_ipython().run_line_magic('timeit', 'parse_int_numpy()') get_ipython().run_line_magic('timeit', 'parse_int_numpy_numba()') get_ipython().run_line_magic('timeit', 'parse_int_vec_cython()') # That's it!