#!/usr/bin/env python
# coding: utf-8

# # Lecture 6B - Tools for high-performance computing applications

# J.R. Johansson (jrjohansson at gmail.com)
# 
# The latest version of this [IPython notebook](http://ipython.org/notebook.html) lecture is available at [http://github.com/jrjohansson/scientific-python-lectures](http://github.com/jrjohansson/scientific-python-lectures).
# 
# The other notebooks in this lecture series are indexed at [http://jrjohansson.github.io](http://jrjohansson.github.io).

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# ## multiprocessing

# Python has a built-in process-based library for concurrent computing, called `multiprocessing`. 

# In[2]:


import multiprocessing
import os
import time
import numpy


# In[3]:


def task(args):
    print("PID =", os.getpid(), ", args =", args)
    
    return os.getpid(), args


# In[4]:


task("test")


# In[5]:


pool = multiprocessing.Pool(processes=4)


# In[6]:


result = pool.map(task, [1,2,3,4,5,6,7,8])


# In[7]:


result


# The multiprocessing package is very useful for highly parallel tasks that do not need to communicate with each other, other than when sending the initial data to the pool of processes and when and collecting the results. 

# ## IPython parallel

# IPython includes a very interesting and versatile parallel computing environment, which is very easy to use. It builds on the concept of ipython engines and controllers, that one can connect to and submit tasks to. To get started using this framework for parallel computing, one first have to start up an IPython cluster of engines. The easiest way to do this is to use the `ipcluster` command,
# 
#     $ ipcluster start -n 4
# 
# Or, alternatively, from the "Clusters" tab on the IPython notebook dashboard page. This will start 4 IPython engines on the current host, which is useful for multicore systems. It is also possible to setup IPython clusters that spans over many nodes in a computing cluster. For more information about possible use cases, see the official documentation [Using IPython for parallel computing](http://ipython.org/ipython-doc/dev/parallel/).
# 
# To use the IPython cluster in our Python programs or notebooks, we start by creating an instance of `IPython.parallel.Client`:

# In[8]:


from IPython.parallel import Client


# In[9]:


cli = Client()


# Using the 'ids' attribute we can retrieve a list of ids for the IPython engines in the cluster:

# In[10]:


cli.ids


# Each of these engines are ready to execute tasks. We can selectively run code on individual engines:

# In[11]:


def getpid():
    """ return the unique ID of the current process """
    import os
    return os.getpid()


# In[12]:


# first try it on the notebook process
getpid()


# In[13]:


# run it on one of the engines
cli[0].apply_sync(getpid)


# In[14]:


# run it on ALL of the engines at the same time
cli[:].apply_sync(getpid)


# We can use this cluster of IPython engines to execute tasks in parallel. The easiest way to dispatch a function to different engines is to define the function with the decorator:
# 
#     @view.parallel(block=True)
# 
# Here, `view` is supposed to be the engine pool which we want to dispatch the function (task). Once our function is defined this way we can dispatch it to the engine using the `map` method in the resulting class (in Python, a decorator is a language construct which automatically wraps the function into another function or a class).
# 
# To see how all this works, lets look at an example:

# In[15]:


dview = cli[:]


# In[16]:


@dview.parallel(block=True)
def dummy_task(delay):
    """ a dummy task that takes 'delay' seconds to finish """
    import os, time

    t0 = time.time()
    pid = os.getpid()
    time.sleep(delay)
    t1 = time.time()
    
    return [pid, t0, t1]


# In[17]:


# generate random delay times for dummy tasks
delay_times = numpy.random.rand(4)


# Now, to map the function `dummy_task` to the random delay time data, we use the `map` method in `dummy_task`:

# In[18]:


dummy_task.map(delay_times)


# Let's do the same thing again with many more tasks and visualize how these tasks are executed on different IPython engines:

# In[19]:


def visualize_tasks(results):
    res = numpy.array(results)
    fig, ax = plt.subplots(figsize=(10, res.shape[1]))
    
    yticks = []
    yticklabels = []
    tmin = min(res[:,1])
    for n, pid in enumerate(numpy.unique(res[:,0])):
        yticks.append(n)
        yticklabels.append("%d" % pid)
        for m in numpy.where(res[:,0] == pid)[0]:
            ax.add_patch(plt.Rectangle((res[m,1] - tmin, n-0.25),
                         res[m,2] - res[m,1], 0.5, color="green", alpha=0.5))
        
    ax.set_ylim(-.5, n+.5)
    ax.set_xlim(0, max(res[:,2]) - tmin + 0.)
    ax.set_yticks(yticks)
    ax.set_yticklabels(yticklabels)
    ax.set_ylabel("PID")
    ax.set_xlabel("seconds")


# In[20]:


delay_times = numpy.random.rand(64)


# In[21]:


result = dummy_task.map(delay_times)
visualize_tasks(result)


# That's a nice and easy parallelization! We can see that we utilize all four engines quite well.
# 
# But one short coming so far is that the tasks are not load balanced, so one engine might be idle while others still have more tasks to work on.
# 
# However, the IPython parallel environment provides a number of alternative "views" of the engine cluster, and there is a view that provides load balancing as well (above we have used the "direct view", which is why we called it "dview").
# 
# To obtain a load balanced view we simply use the `load_balanced_view` method in the engine cluster client instance `cli`:

# In[22]:


lbview = cli.load_balanced_view()


# In[23]:


@lbview.parallel(block=True)
def dummy_task_load_balanced(delay):
    """ a dummy task that takes 'delay' seconds to finish """
    import os, time

    t0 = time.time()
    pid = os.getpid()
    time.sleep(delay)
    t1 = time.time()
    
    return [pid, t0, t1]


# In[24]:


result = dummy_task_load_balanced.map(delay_times)
visualize_tasks(result)


# In the example above we can see that the engine cluster is a bit more efficiently used, and the time to completion is shorter than in the previous example.

# ### Further reading

# There are many other ways to use the IPython parallel environment. The official documentation has a nice guide:
# 
# * http://ipython.org/ipython-doc/dev/parallel/

# ## MPI

# When more communication between processes is required, sophisticated solutions such as MPI and OpenMP are often needed. MPI is process based parallel processing library/protocol, and can be used in Python programs through the `mpi4py` package:
# 
# http://mpi4py.scipy.org/
# 
# To use the `mpi4py` package we include `MPI` from `mpi4py`:
# 
#     from mpi4py import MPI
# 
# A MPI python program must be started using the `mpirun -n N` command, where `N` is the number of processes that should be included in the process group.
# 
# Note that the IPython parallel environment also has support for MPI, but to begin with we will use `mpi4py` and the `mpirun` in the follow examples.

# ### Example 1

# In[25]:


get_ipython().run_cell_magic('file', 'mpitest.py', '\nfrom mpi4py import MPI\n\ncomm = MPI.COMM_WORLD\nrank = comm.Get_rank()\n\nif rank == 0:\n   data = [1.0, 2.0, 3.0, 4.0]\n   comm.send(data, dest=1, tag=11)\nelif rank == 1:\n   data = comm.recv(source=0, tag=11)\n    \nprint "rank =", rank, ", data =", data\n')


# In[26]:


get_ipython().system('mpirun -n 2 python mpitest.py')


# ### Example 2

# Send a numpy array from one process to another:

# In[27]:


get_ipython().run_cell_magic('file', 'mpi-numpy-array.py', '\nfrom mpi4py import MPI\nimport numpy\n\ncomm = MPI.COMM_WORLD\nrank = comm.Get_rank()\n\nif rank == 0:\n   data = numpy.random.rand(10)\n   comm.Send(data, dest=1, tag=13)\nelif rank == 1:\n   data = numpy.empty(10, dtype=numpy.float64)\n   comm.Recv(data, source=0, tag=13)\n    \nprint "rank =", rank, ", data =", data\n')


# In[28]:


get_ipython().system('mpirun -n 2 python mpi-numpy-array.py')


# ### Example 3: Matrix-vector multiplication

# In[29]:


# prepare some random data
N = 16
A = numpy.random.rand(N, N)
numpy.save("random-matrix.npy", A)
x = numpy.random.rand(N)
numpy.save("random-vector.npy", x)


# In[30]:


get_ipython().run_cell_magic('file', 'mpi-matrix-vector.py', '\nfrom mpi4py import MPI\nimport numpy\n\ncomm = MPI.COMM_WORLD\nrank = comm.Get_rank()\np = comm.Get_size()\n\ndef matvec(comm, A, x):\n    m = A.shape[0] / p\n    y_part = numpy.dot(A[rank * m:(rank+1)*m], x)\n    y = numpy.zeros_like(x)\n    comm.Allgather([y_part,  MPI.DOUBLE], [y, MPI.DOUBLE])\n    return y\n\nA = numpy.load("random-matrix.npy")\nx = numpy.load("random-vector.npy")\ny_mpi = matvec(comm, A, x)\n\nif rank == 0:\n    y = numpy.dot(A, x)\n    print(y_mpi)\n    print "sum(y - y_mpi) =", (y - y_mpi).sum()\n')


# In[31]:


get_ipython().system('mpirun -n 4 python mpi-matrix-vector.py')


# ### Example 4: Sum of the elements in a vector

# In[32]:


# prepare some random data
N = 128
a = numpy.random.rand(N)
numpy.save("random-vector.npy", a)


# In[33]:


get_ipython().run_cell_magic('file', 'mpi-psum.py', '\nfrom mpi4py import MPI\nimport numpy as np\n\ndef psum(a):\n    r = MPI.COMM_WORLD.Get_rank()\n    size = MPI.COMM_WORLD.Get_size()\n    m = len(a) / size\n    locsum = np.sum(a[r*m:(r+1)*m])\n    rcvBuf = np.array(0.0, \'d\')\n    MPI.COMM_WORLD.Allreduce([locsum, MPI.DOUBLE], [rcvBuf, MPI.DOUBLE], op=MPI.SUM)\n    return rcvBuf\n\na = np.load("random-vector.npy")\ns = psum(a)\n\nif MPI.COMM_WORLD.Get_rank() == 0:\n    print "sum =", s, ", numpy sum =", a.sum()\n')


# In[34]:


get_ipython().system('mpirun -n 4 python mpi-psum.py')


# ### Further reading

# * http://mpi4py.scipy.org
# 
# * http://mpi4py.scipy.org/docs/usrman/tutorial.html
# 
# * https://computing.llnl.gov/tutorials/mpi/

# ## OpenMP

# What about OpenMP? OpenMP is a standard and widely used thread-based parallel API that unfortunately is **not** useful directly in Python. The reason is that the CPython implementation use a global interpreter lock, making it impossible to simultaneously run several Python threads. Threads are therefore not useful for parallel computing in Python, unless it is only used to wrap compiled code that do the OpenMP parallelization (Numpy can do something like that). 
# 
# This is clearly a limitation in the Python interpreter, and as a consequence all parallelization in Python must use processes (not threads).
# 
# However, there is a way around this that is not that painful. When calling out to compiled code the GIL is released, and it is possible to write Python-like code in Cython where we can selectively release the GIL and do OpenMP computations. 

# In[35]:


N_core = multiprocessing.cpu_count()

print("This system has %d cores" % N_core)


# Here is a simple example that shows how OpenMP can be used via cython:

# In[36]:


get_ipython().run_line_magic('load_ext', 'cythonmagic')


# In[37]:


get_ipython().run_cell_magic('cython', '-f -c-fopenmp --link-args=-fopenmp -c-g', '\ncimport cython\ncimport numpy\nfrom cython.parallel import prange, parallel\ncimport openmp\n\ndef cy_openmp_test():\n\n    cdef int n, N\n\n    # release GIL so that we can use OpenMP\n    with nogil, parallel():\n        N = openmp.omp_get_num_threads()\n        n = openmp.omp_get_thread_num()\n        with gil:\n            print("Number of threads %d: thread number %d" % (N, n))\n')


# In[38]:


cy_openmp_test()


# ### Example: matrix vector multiplication

# In[39]:


# prepare some random data
N = 4 * N_core

M = numpy.random.rand(N, N)
x = numpy.random.rand(N)
y = numpy.zeros_like(x)


# Let's first look at a simple implementation of matrix-vector multiplication in Cython:

# In[40]:


get_ipython().run_cell_magic('cython', '', '\ncimport cython\ncimport numpy\nimport numpy\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef cy_matvec(numpy.ndarray[numpy.float64_t, ndim=2] M, \n              numpy.ndarray[numpy.float64_t, ndim=1] x, \n              numpy.ndarray[numpy.float64_t, ndim=1] y):\n\n    cdef int i, j, n = len(x)\n\n    for i from 0 <= i < n:\n        for j from 0 <= j < n:\n            y[i] += M[i, j] * x[j]\n            \n    return y\n')


# In[41]:


# check that we get the same results
y = numpy.zeros_like(x)
cy_matvec(M, x, y)
numpy.dot(M, x) - y


# In[42]:


get_ipython().run_line_magic('timeit', 'numpy.dot(M, x)')


# In[43]:


get_ipython().run_line_magic('timeit', 'cy_matvec(M, x, y)')


# The Cython implementation here is a bit slower than numpy.dot, but not by much, so if we can use multiple cores with OpenMP it should be possible to beat the performance of numpy.dot.

# In[44]:


get_ipython().run_cell_magic('cython', '-f -c-fopenmp --link-args=-fopenmp -c-g', '\ncimport cython\ncimport numpy\nfrom cython.parallel import parallel\ncimport openmp\n\n@cython.boundscheck(False)\n@cython.wraparound(False)\ndef cy_matvec_omp(numpy.ndarray[numpy.float64_t, ndim=2] M, \n                  numpy.ndarray[numpy.float64_t, ndim=1] x, \n                  numpy.ndarray[numpy.float64_t, ndim=1] y):\n\n    cdef int i, j, n = len(x), N, r, m\n\n    # release GIL, so that we can use OpenMP\n    with nogil, parallel():\n        N = openmp.omp_get_num_threads()\n        r = openmp.omp_get_thread_num()\n        m = n / N\n        \n        for i from 0 <= i < m:\n            for j from 0 <= j < n:\n                y[r * m + i] += M[r * m + i, j] * x[j]\n\n    return y\n')


# In[45]:


# check that we get the same results
y = numpy.zeros_like(x)
cy_matvec_omp(M, x, y)
numpy.dot(M, x) - y


# In[46]:


get_ipython().run_line_magic('timeit', 'numpy.dot(M, x)')


# In[47]:


get_ipython().run_line_magic('timeit', 'cy_matvec_omp(M, x, y)')


# Now, this implementation is much slower than numpy.dot for this problem size, because of overhead associated with OpenMP and threading, etc. But let's look at the how the different implementations compare with larger matrix sizes:

# In[48]:


N_vec  = numpy.arange(25, 2000, 25) * N_core


# In[49]:


duration_ref = numpy.zeros(len(N_vec))
duration_cy = numpy.zeros(len(N_vec))
duration_cy_omp = numpy.zeros(len(N_vec))

for idx, N in enumerate(N_vec):
    
    M = numpy.random.rand(N, N)
    x = numpy.random.rand(N)
    y = numpy.zeros_like(x)
    
    t0 = time.time()
    numpy.dot(M, x)
    duration_ref[idx] = time.time() - t0
    
    t0 = time.time()
    cy_matvec(M, x, y)
    duration_cy[idx] = time.time() - t0
    
    t0 = time.time()
    cy_matvec_omp(M, x, y)
    duration_cy_omp[idx] = time.time() - t0


# In[50]:


fig, ax = plt.subplots(figsize=(12, 6))

ax.loglog(N_vec, duration_ref, label='numpy')
ax.loglog(N_vec, duration_cy, label='cython')
ax.loglog(N_vec, duration_cy_omp, label='cython+openmp')

ax.legend(loc=2)
ax.set_yscale("log")
ax.set_ylabel("matrix-vector multiplication duration")
ax.set_xlabel("matrix size");


# For large problem sizes the the cython+OpenMP implementation is faster than numpy.dot.

# With this simple implementation, the speedup for large problem sizes is about:

# In[51]:


((duration_ref / duration_cy_omp)[-10:]).mean()


#  Obviously one could do a better job with more effort, since the theoretical limit of the speed-up is:

# In[52]:


N_core


# ### Further reading

#  * http://openmp.org
#  * http://docs.cython.org/src/userguide/parallelism.html

# ## OpenCL

# OpenCL is an API for heterogenous computing, for example using GPUs for numerical computations. There is a python package called `pyopencl` that allows OpenCL code to be compiled, loaded and executed on the compute units completely from within Python. This is a nice way to work with OpenCL, because the time-consuming computations should be done on the compute units in compiled code, and in this Python only server as a control language.

# In[53]:


get_ipython().run_cell_magic('file', 'opencl-dense-mv.py', '\nimport pyopencl as cl\nimport numpy\nimport time\n\n# problem size\nn = 10000\n\n# platform\nplatform_list = cl.get_platforms()\nplatform = platform_list[0]\n\n# device\ndevice_list = platform.get_devices()\ndevice = device_list[0]\n\nif False:\n    print("Platform name:" + platform.name)\n    print("Platform version:" + platform.version)\n    print("Device name:" + device.name)\n    print("Device type:" + cl.device_type.to_string(device.type))\n    print("Device memory: " + str(device.global_mem_size//1024//1024) + \' MB\')\n    print("Device max clock speed:" + str(device.max_clock_frequency) + \' MHz\')\n    print("Device compute units:" + str(device.max_compute_units))\n\n# context\nctx = cl.Context([device]) # or we can use cl.create_some_context()\n\n# command queue\nqueue = cl.CommandQueue(ctx)\n\n# kernel\nKERNEL_CODE = """\n//\n// Matrix-vector multiplication: r = m * v\n//\n#define N %(mat_size)d\n__kernel\nvoid dmv_cl(__global float *m, __global float *v, __global float *r)\n{\n    int i, gid = get_global_id(0);\n    \n    r[gid] = 0;\n    for (i = 0; i < N; i++)\n    {\n        r[gid] += m[gid * N + i] * v[i];\n    }\n}\n"""\n\nkernel_params = {"mat_size": n}\nprogram = cl.Program(ctx, KERNEL_CODE % kernel_params).build()\n\n# data\nA = numpy.random.rand(n, n)\nx = numpy.random.rand(n, 1)\n\n# host buffers\nh_y = numpy.empty(numpy.shape(x)).astype(numpy.float32)\nh_A = numpy.real(A).astype(numpy.float32)\nh_x = numpy.real(x).astype(numpy.float32)\n\n# device buffers\nmf = cl.mem_flags\nd_A_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_A)\nd_x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_x)\nd_y_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_y.nbytes)\n\n# execute OpenCL code\nt0 = time.time()\nevent = program.dmv_cl(queue, h_y.shape, None, d_A_buf, d_x_buf, d_y_buf)\nevent.wait()\ncl.enqueue_copy(queue, h_y, d_y_buf)\nt1 = time.time()\n\nprint "opencl elapsed time =", (t1-t0)\n\n# Same calculation with numpy\nt0 = time.time()\ny = numpy.dot(h_A, h_x)\nt1 = time.time()\n\nprint "numpy elapsed time =", (t1-t0)\n\n# see if the results are the same\nprint "max deviation =", numpy.abs(y-h_y).max()\n')


# In[54]:


get_ipython().system('python opencl-dense-mv.py')


# ### Further reading

# * http://mathema.tician.de/software/pyopencl

# ## Versions

# In[55]:


get_ipython().run_line_magic('load_ext', 'version_information')

get_ipython().run_line_magic('version_information', 'numpy, mpi4py, Cython')