#!/usr/bin/env python
# coding: utf-8

# # A Python Performance Optimization Exercise
# 
# ## Author [Jean-François Puget](https://www.ibm.com/developerworks/community/blogs/jfp/?lang=en)
# 
# A revisit of the Python micro benchmarks available at https://github.com/JuliaLang/julia/blob/master/test/perf/micro/perf.py
# 
# The code used in this notebook is discussed at length in [Python Meets Julia Micro Performance](https://www.ibm.com/developerworks/community/blogs/jfp/entry/python_meets_julia_micro_performance)
# 
# Timings below are for Anaconda 64 bits with Python 3.5.1 and Numba 0.23.1 running on a Windows laptop (Lenovo W520).  Timings on another machine with another Python version can be quite different.

# First, let's load useful extensions and import useful modules

# In[1]:


get_ipython().run_line_magic('load_ext', 'Cython')
get_ipython().run_line_magic('load_ext', 'line_profiler')

import random
import numpy as np
from numba import jit
import sys

if sys.version_info < (3,):
    range = xrange


# # Fibonacci

# ### Naive code
# 
# The base line is gthe one used in Julia benchmarks.  It is a straightforward, frecursive implementation.

# In[2]:


def fib(n):
    if n<2:
        return n
    return fib(n-1)+fib(n-2)


# Sanity check.

# In[3]:


assert fib(20) == 6765


# ### Cython
# 
# Let's try various versions.

# In[4]:


get_ipython().run_cell_magic('cython', '', 'def fib_cython(n):\n    if n<2:\n        return n\n    return fib_cython(n-1)+fib_cython(n-2)\n\ncpdef inline fib_cython_inline(n):\n    if n<2:\n        return n\n    return fib_cython_inline(n-1)+fib_cython_inline(n-2)\n\n\ncpdef long fib_cython_type(long n):\n    if n<2:\n        return n\n    return fib_cython_type(n-1)+fib_cython_type(n-2)\n\ncpdef long fib_cython_type_inline(long n):\n    if n<2:\n        return n\n    return fib_cython_type_inline(n-1)+fib_cython_type_inline(n-2)\n')


# Sanity checks

# In[5]:


assert fib_cython(20) == 6765
assert fib_cython_inline(20) == 6765
assert fib_cython_type(20) == 6765
assert fib_cython_type_inline(20) == 6765


# ### Numba
# 
# Let's try three versions, with and without type.

# In[6]:


from numba import int32, int64
@jit
def fib_numba(n):
    if n<2:
        return n
    return fib_numba(n-1)+fib_numba(n-2)

@jit(int32(int32))
def fib_numba_type32(n):
    if n<2:
        return n
    return fib_numba_type32(n-1)+fib_numba_type32(n-2)


@jit(int64(int64))
def fib_numba_type64(n):
    if n<2:
        return n
    return fib_numba_type64(n-1)+fib_numba_type64(n-2)


# Sanity check

# In[7]:


assert fib_numba(20) == 6765
assert fib_numba_type32(20) == 6765
assert fib_numba_type64(20) == 6765


# ### Using floats

# In[8]:


def ffib(n):
    if n<2.0:
        return n
    return ffib(n-1)+ffib(n-2)


# ### Timings

# In[9]:


get_ipython().run_line_magic('timeit', 'fib(20)')
get_ipython().run_line_magic('timeit', 'fib_cython(20)')
get_ipython().run_line_magic('timeit', 'fib_cython_inline(20)')
get_ipython().run_line_magic('timeit', 'fib_cython_type(20)')
get_ipython().run_line_magic('timeit', 'fib_cython_type_inline(20)')
get_ipython().run_line_magic('timeit', 'fib_numba(20)')
get_ipython().run_line_magic('timeit', 'fib_numba_type32(20)')
get_ipython().run_line_magic('timeit', 'fib_numba_type64(20)')
get_ipython().run_line_magic('timeit', 'ffib(20.0)')


# Let's try a larger one.

# In[10]:


n = 30
get_ipython().run_line_magic('timeit', 'fib(n)')
get_ipython().run_line_magic('timeit', 'fib_cython(n)')
get_ipython().run_line_magic('timeit', 'fib_cython_inline(n)')
get_ipython().run_line_magic('timeit', 'fib_cython_type(n)')
get_ipython().run_line_magic('timeit', 'fib_cython_type_inline(n)')
get_ipython().run_line_magic('timeit', 'fib_numba(n)')
get_ipython().run_line_magic('timeit', 'ffib(n*1.0)')


# ## Sequential Fibonacci
# 
# The recursive call is not the best way to compute Fibonacci numbers.  It is better to accumulate them as follows.

# ### Functools

# In[51]:


from functools import lru_cache as cache

@cache(maxsize=None)
def fib_cache(n):
    if n<2:
        return n
    return fib_cache(n-1)+fib_cache(n-2)


# ### Plain Python

# In[11]:


def fib_seq(n):
    if n < 2:
        return n
    a,b = 1,0
    for i in range(n-1):
        a,b = a+b,a
    return a    


# ### Numba

# In[12]:


@jit
def fib_seq_numba(n):
    if n < 2:
        return n
    a,b = 1,0
    for i in range(n-1):
        a,b = a+b,a
    return a    


# ### Cython

# In[13]:


get_ipython().run_cell_magic('cython', '', '\ndef fib_seq_cython(n):\n    if n < 2:\n        return n\n    a,b = 1,0\n    for i in range(n-1):\n        a,b = a+b,a\n    return a    \n\ncpdef long fib_seq_cython_type(long n):\n    if n < 2:\n        return n\n    cdef long a,b\n    a,b = 1,0\n    for i in range(n-1):\n        a,b = a+b,a\n    return a    \n')


# ### Sanity checks

# In[14]:


assert fib_seq(20) == 6765
assert fib_seq_cython(20) == 6765
assert fib_seq_cython_type(20) == 6765
assert fib_seq_numba(20) == 6765


# ### Timings

# In[52]:


get_ipython().run_line_magic('timeit', 'fib_cache(20)')
get_ipython().run_line_magic('timeit', 'fib_seq(20)')
get_ipython().run_line_magic('timeit', 'fib_seq_cython(20)')
get_ipython().run_line_magic('timeit', 'fib_seq_cython_type(20)')
get_ipython().run_line_magic('timeit', 'fib_seq_numba(20)')


# ### Arbitrary precision check

# In[16]:


fib_seq(100)


# # Quicksort

# ### Pure Python
# 
# Code used in Julia benchmarks

# In[17]:


def qsort_kernel(a, lo, hi):
    i = lo
    j = hi
    while i < hi:
        pivot = a[(lo+hi) // 2]
        while i <= j:
            while a[i] < pivot:
                i += 1
            while a[j] > pivot:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
        if lo < j:
            qsort_kernel(a, lo, j)
        lo = i
        j = hi
    return a


def benchmark_qsort():
    lst = [ random.random() for i in range(1,5000) ]
    qsort_kernel(lst, 0, len(lst)-1)


# ### Numba

# In[18]:


@jit
def qsort_kernel_numba(a, lo, hi):
    i = lo
    j = hi
    while i < hi:
        pivot = a[(lo+hi) // 2]
        while i <= j:
            while a[i] < pivot:
                i += 1
            while a[j] > pivot:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
        if lo < j:
            qsort_kernel_numba(a, lo, j)
        lo = i
        j = hi
    return a

def benchmark_qsort_numba():
        lst = [ random.random() for i in range(1,5000) ]
        qsort_kernel_numba(lst, 0, len(lst)-1)


# With Numpy

# In[19]:


@jit
def qsort_kernel_numba_numpy(a, lo, hi):
    i = lo
    j = hi
    while i < hi:
        pivot = a[(lo+hi) // 2]
        while i <= j:
            while a[i] < pivot:
                i += 1
            while a[j] > pivot:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
        if lo < j:
            qsort_kernel_numba_numpy(a, lo, j)
        lo = i
        j = hi
    return a

def benchmark_qsort_numba_numpy():
        lst = np.random.rand(5000)
        qsort_kernel_numba(lst, 0, len(lst)-1)


# ### Cython

# In[20]:


get_ipython().run_cell_magic('cython', '', 'import numpy as np\nimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncdef double[:] \\\nqsort_kernel_cython_numpy_type(double[:] a, \\\n                               long lo, \\\n                               long hi):\n    cdef: \n        long i, j\n        double pivot\n    i = lo\n    j = hi\n    while i < hi:\n        pivot = a[(lo+hi) // 2]\n        while i <= j:\n            while a[i] < pivot:\n                i += 1\n            while a[j] > pivot:\n                j -= 1\n            if i <= j:\n                a[i], a[j] = a[j], a[i]\n                i += 1\n                j -= 1\n        if lo < j:\n            qsort_kernel_cython_numpy_type(a, lo, j)\n        lo = i\n        j = hi\n    return a\n\ndef benchmark_qsort_numpy_cython():\n    lst = np.random.rand(5000)\n    qsort_kernel_cython_numpy_type(lst, 0, len(lst)-1)\n    \n')


# In[21]:


get_ipython().run_line_magic('timeit', 'benchmark_qsort()')
get_ipython().run_line_magic('timeit', 'benchmark_qsort_numba()')
get_ipython().run_line_magic('timeit', 'benchmark_qsort_numba_numpy()')
get_ipython().run_line_magic('timeit', 'benchmark_qsort_numpy_cython()')


# ### Built-in sort

# In[22]:


def benchmark_sort():
    lst = [ random.random() for i in range(1,5000) ]
    lst.sort()

def benchmark_sort_numpy():
    lst = np.random.rand(5000)
    np.sort(lst)


# In[23]:


get_ipython().run_line_magic('timeit', 'benchmark_sort()')
get_ipython().run_line_magic('timeit', 'benchmark_sort_numpy()')


# # Random mat stat

# ### Baseline
# 
# Used in Julia benchmarks

# In[24]:


def randmatstat(t):
    n = 5
    v = np.zeros(t)
    w = np.zeros(t)
    for i in range(1,t):
        a = np.random.randn(n, n)
        b = np.random.randn(n, n)
        c = np.random.randn(n, n)
        d = np.random.randn(n, n)
        P = np.matrix(np.hstack((a, b, c, d)))
        Q = np.matrix(np.vstack((np.hstack((a, b)), np.hstack((c, d)))))
        v[i] = np.trace(np.linalg.matrix_power(np.transpose(P)*P, 4))
        w[i] = np.trace(np.linalg.matrix_power(np.transpose(Q)*Q, 4))
    return (np.std(v)/np.mean(v), np.std(w)/np.mean(w))


# ### Speeding it up

# In[25]:


def my_power(a):
    a = np.dot(a.T,a)
    a = np.dot(a,a)
    a = np.dot(a,a)
    return a
    
def randmatstat_fast(t):
    n = 5
    v = np.zeros(t)
    w = np.zeros(t)
    for i in range(1,t):
        a = np.random.randn(n, n)
        b = np.random.randn(n, n)
        c = np.random.randn(n, n)
        d = np.random.randn(n, n)
        P = np.hstack((a, b, c, d))
        Q = np.vstack((np.hstack((a, b)), np.hstack((c, d))))
        v[i] = np.trace(my_power(P))
        w[i] = np.trace(my_power(Q))
    return (np.std(v)/np.mean(v), np.std(w)/np.mean(w))


# ### Timings

# In[26]:


get_ipython().run_line_magic('timeit', 'randmatstat(1000)')
get_ipython().run_line_magic('timeit', 'randmatstat_fast(1000)')


# # Randmatmul

# In[27]:


def randmatmul(n):
    A = np.random.rand(n,n)
    B = np.random.rand(n,n)
    return np.dot(A,B)


# In[28]:


get_ipython().run_line_magic('timeit', 'randmatmul(1000)')


# # Mandelbrot

# ### Baseline
# The baseline used in Julia benchmarks

# In[29]:


def mandel(z):
    maxiter = 80
    c = z
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return maxiter

def mandelperf():
    r1 = np.linspace(-2.0, 0.5, 26)
    r2 = np.linspace(-1.0, 1.0, 21)
    return [mandel(complex(r, i)) for r in r1 for i in r2]


# In[30]:


assert sum(mandelperf()) == 14791


# ### Numba

# In[31]:


@jit
def mandel_numba(z):
    maxiter = 80
    c = z
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return maxiter

def mandelperf_numba():
    r1 = np.linspace(-2.0, 0.5, 26)
    r2 = np.linspace(-1.0, 1.0, 21)
    c3 = [complex(r,i) for r in r1 for i in r2]
    return [mandel_numba(c) for c in c3]


# In[32]:


assert sum(mandelperf_numba()) == 14791


# ### Cython

# In[33]:


get_ipython().run_cell_magic('cython', '', '\nimport numpy as np\ncimport numpy as np\n\ncdef long mandel_cython_type(np.complex z):\n    cdef long maxiter, n\n    cdef np.complex c\n    maxiter = 80\n    c = z\n    for n in range(maxiter):\n        if abs(z) > 2:\n            return n\n        z = z*z + c\n    return maxiter\n\ncpdef mandelperf_cython_type():\n    cdef double[:] r1,r2\n    r1 = np.linspace(-2.0, 0.5, 26)\n    r2 = np.linspace(-1.0, 1.0, 21)\n    return [mandel_cython_type(complex(r, i)) for r in r1 for i in r2]\n')


# In[34]:


assert sum(mandelperf_cython_type()) == 14791


# In[35]:


get_ipython().run_line_magic('timeit', 'mandelperf()')
get_ipython().run_line_magic('timeit', 'mandelperf_cython_type()')
get_ipython().run_line_magic('timeit', 'mandelperf_numba()')


# ### Profiling

# In[36]:


get_ipython().run_line_magic('lprun', '-s -f mandelperf_numba mandelperf_numba()')


# ### Numpy

# In[53]:


@jit
def mandel_numba(z):
    maxiter = 80
    c = z
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return maxiter

@jit
def mandelperf_numba_mesh():
    width = 26
    height = 21
    r1 = np.linspace(-2.0, 0.5, width)
    r2 = np.linspace(-1.0, 1.0, height)
    mandel_set = np.empty((width,height), dtype=int)
    for i in range(width):
        for j in range(height):
            mandel_set[i,j] = mandel_numba(r1[i] + 1j*r2[j])
    return mandel_set


# In[54]:


assert np.sum(mandelperf_numba_mesh()) == 14791


# In[55]:


get_ipython().run_line_magic('timeit', 'mandelperf_numba_mesh()')


# An even faster code is presented in [How To Quickly Compute The Mandelbrot Set In Python](https://www.ibm.com/developerworks/community/blogs/jfp/entry/How_To_Compute_Mandelbrodt_Set_Quickly?lang=en)

# # Pisum

# In[43]:


def pisum(t):
    sum = 0.0
    for j in range(1, 501):
        sum = 0.0
        for k in range(1, t+1):
            sum += 1.0/(k*k)
    return sum

@jit
def pisum_numba(t):
    sum = 0.0
    for j in range(1, 501):
        sum = 0.0
        for k in range(1, t+1):
            sum += 1.0/(k*k)
    return sum


# In[44]:


get_ipython().run_cell_magic('cython', '', '\nimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef double pisum_cython(int t):\n    cdef:\n        double sum = 0.0\n        int j,k\n    for j in range(1, 501):\n        sum = 0.0\n        for k in range(1, t+1):\n            sum += 1.0/(k*k)\n    return sum\n')


# In[45]:


assert abs(pisum(10000)-1.644834071848065) < 1e-6
assert abs(pisum_numba(10000)-1.644834071848065) < 1e-6
assert abs(pisum_cython(10000)-1.644834071848065) < 1e-6


# In[46]:


get_ipython().run_line_magic('timeit', 'pisum(10000)')
get_ipython().run_line_magic('timeit', 'pisum_numba(10000)')
get_ipython().run_line_magic('timeit', 'pisum_cython(10000)')


# ## Parse int

# In[19]:


def parse_int():
    for i in range(1,1000):
        n = random.randint(0,2**32-1)
        s = hex(n)
        
        # required for Python 2.7 on 64 bits machine
        if s[-1]=='L':
            s = s[0:-1]
            
        m = int(s,16)
        assert m == n
        
def parse_int_numpy():
    for i in range(1,1000):
        n = np.random.randint(0,2**31-1)
        s = hex(n)
        
        # required for Python 2.7 on 64 bits machine
        if s[-1]=='L':
            s = s[0:-1]
            
        m = int(s,16)
        assert m == n
        
def parse_int_opt():
    for i in range(1,1000):
        n = random.randint(0,2**32-1)
        s = hex(n)
        
        # required for Python 2.7 on 64 bits machine
        if s.endswith('L'):
            s = s[0:-1]
            
        m = int(s,16)
        assert m == n
        
def parse_int1():
    for i in range(1,1000):
        n = random.randint(0,2**32-1)
        s = hex(n)
        m = int(s,16)
        assert m == n
        
def parse_int1_numpy():
    for i in range(1,1000):
        n = np.random.randint(0,2**31-1)
        s = hex(n)
        m = int(s,16)
        assert m == n
        
@jit
def parse_numba():
    for i in range(1,1000):
        n = random.randint(0,2**32-1)
        s = hex(n)
        m = int(s,16)
        assert m == n

@jit
def parse_numba_numpy():
    for i in range(1,1000):
        n = np.random.randint(0,2**31-1)
        s = hex(n)
        m = int(s,16)
        assert m == n


# Int in C are up to 2^31-1, hence we must use this as the limit when using Cython or Numpy

# In[21]:


get_ipython().run_cell_magic('cython', '', 'import random\nimport cython\ncimport cython\nimport numpy as np\ncimport numpy as np\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_cython():\n    cdef:\n        int i,n,m\n    for i in range(1,1000):\n        n = random.randint(0,2**31-1)\n        m = int(hex(n),16)\n        assert m == n\n        \n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_cython_numpy():\n    cdef:\n        int i,n,m\n    for i in range(1,1000):\n        n = np.random.randint(0,2**31-1)\n        m = int(hex(n),16)\n        assert m == n\n')


# In[22]:


get_ipython().run_line_magic('timeit', 'parse_int()')
get_ipython().run_line_magic('timeit', 'parse_int_numpy()')
get_ipython().run_line_magic('timeit', 'parse_int_opt()')
get_ipython().run_line_magic('timeit', 'parse_int1()')
get_ipython().run_line_magic('timeit', 'parse_int1_numpy()')
get_ipython().run_line_magic('timeit', 'parse_numba()')
get_ipython().run_line_magic('timeit', 'parse_numba_numpy()')
get_ipython().run_line_magic('timeit', 'parse_int_cython()')
get_ipython().run_line_magic('timeit', 'parse_int_cython_numpy()')


# ### Vectorized operation

# In[61]:


get_ipython().run_line_magic('lprun', '-s -f parse_int parse_int()')


# In[25]:


import numpy as np
def parse_int_vec():
    n = np.random.randint(2^31-1,size=1000)
    for i in range(1,1000):
        ni = n[i]
        s = hex(ni)
        m = int(s,16)
        assert m == ni
      
@jit
def parse_int_vec_numba():
    n = np.random.randint(2^31-1,size=1000)
    for i in range(1,1000):
        ni = n[i]
        s = hex(ni)
        m = int(s,16)
        assert m == ni


# In[26]:


vhex = np.vectorize(hex)
vint = np.vectorize(int)

def parse_int_numpy():
    n = np.random.randint(0,2**31-1,1000)
    s = vhex(n)
    m = vint(s,16)
    np.all(m == n)
    return s

@jit
def parse_int_numpy_numba():
    n = np.random.randint(0,2**31-1,1000)
    s = vhex(n)
    m = vint(s,16)
    np.all(m == n)


# In[27]:


get_ipython().run_cell_magic('cython', '', 'import numpy as np\nimport cython\ncimport numpy\ncimport cython\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ncpdef parse_int_vec_cython():\n    cdef:\n        int i,m\n        int[:] n\n    n = np.random.randint(0,2**31-1,1000)\n    for i in range(1,1000):\n        m = int(hex(n[i]),16)\n        assert m == n[i]\n')


# In[28]:


get_ipython().run_line_magic('timeit', 'parse_int_vec()')
get_ipython().run_line_magic('timeit', 'parse_int_vec_numba()')
get_ipython().run_line_magic('timeit', 'parse_int_numpy()')
get_ipython().run_line_magic('timeit', 'parse_int_numpy_numba()')
get_ipython().run_line_magic('timeit', 'parse_int_vec_cython()')


# That's it!