#!/usr/bin/env python
# coding: utf-8

# # Benchmarks

# In[1]:

# NBVAL_IGNORE_OUTPUT
get_ipython().run_line_magic('load_ext', 'watermark')
import os
import qutip
import numpy as np
from numpy import pi
import matplotlib
import matplotlib.pylab as plt
import newtonprop
import snakeviz
import numba
get_ipython().run_line_magic('watermark', '-v --iversions')

# While only available interactively, `snakeviz` is a very good way to look
# at profiling data:

# In[2]:

get_ipython().run_line_magic('load_ext', 'snakeviz')

# The benchmarks are based on the Examples, so we rerun the entire Example
# notebook in this context:

# In[3]:

get_ipython().run_cell_magic('capture', '', '%run example.ipynb\n')

# ## Propagation runtime

# In[4]:

tlist = np.linspace(0, 10, 100)

# First, we measure how long the propagation with QuTiP takes. This happens
# mostly in compiled code, so it is pretty fast.

# In[5]:

get_ipython().run_cell_magic('timeit', '', 'qutip.mesolve(L, rho0, tlist)\n')

# In[6]:

get_ipython().run_cell_magic('timeit', '', 'propagate_expm(L, rho0, tlist)\n')

# The Newton propagator, being a reference implementation, is implemented in
# pure Python, and is several orders of magnitude slower. To make the
# comparison fair, we limit the precision to $10^{-8}$, which is roughly the
# precision of `mesolve`.

# In[7]:

get_ipython().run_cell_magic('timeit', '', 'propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8)\n')

# When using lower-level data types, things get considerably faster:

# In[8]:

get_ipython().run_cell_magic('timeit', '', 'propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)\n')

# In[9]:

get_ipython().run_cell_magic('timeit', '', 'propagate(L_vectorized, rho0_vectorized, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)\n')

# ## Profiling

# We can profile how much time is spent in the various routines, comparing
# `mesolve` and different variations of the Newton propagator.
# See https://docs.python.org/3/library/profile.html#instant-user-s-manual
# for the meaning of the columns.

# ### mesolve

# First, we look at QuTiP's `mesolve`:

# In[10]:

stats = get_ipython().run_line_magic('prun', '-q -r qutip.mesolve(L, rho0, tlist);')

# We can look at which top-level routines we spent the most time in
# *cumulatively*, that is, including sub-calls:

# In[11]:

stats.sort_stats('cumtime').print_stats(10);

# Or, the bottom-level routines where we *actually* spent time:

# In[12]:

stats.sort_stats('tottime').print_stats(10);

# This is dominated by the ODE solver and sparse matrix operations.

# If we're working interactively, we could use `snakeviz` to analyze further
# details:

# In[13]:

#%snakeviz qutip.mesolve(L, rho0, tlist)

# ### qutip-propagation

# Next, we look at the Newton propagator operating on high-level qutip
# objects:

# In[14]:

#%snakeviz propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8)

# In[15]:

stats = get_ipython().run_line_magic('prun', '-q -r propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8);')

# In[16]:

stats.sort_stats('cumtime').print_stats(10);

# In[17]:

stats.sort_stats('tottime').print_stats(10);

# ### cythonized qutip-propagation

# Lastly, we can look at the more efficient propagation using a cythonized
# application of the QuTiP objects:

# In[18]:

#%snakeviz propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)

# In[19]:

stats = get_ipython().run_line_magic('prun', '-q -r propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8);')

# In[20]:

stats.sort_stats('cumtime').print_stats(10);

# In[21]:

stats.sort_stats('tottime').print_stats(10);

# We now see that the (Python-level) Arnoldi and step routines of Newton come
# out as the bottleneck, as the application of the Liouvillian has become
# efficient (running at C speed).
# Note that the Newton implementation, in particular the `_extend_leja`
# function, has been sped up through the use of numba. Without that,
# `_extend_leja` would dominate.

# ## Number of function evaluations

# Since the Newton propagator is ultimately still limited by being
# implemented in Python, it is more fair to measure the runtime in terms of
# the average number of applications of the Liouvillian per time step. This
# is under the assumption that in an efficient implementation (and for large
# Hilbert spaces), this is the dominating factor.
#
# The number of applications depends on the chosen precision and on the
# length of the time step: longer time steps tend to be more efficient, as
# only then we're in a regime where the fast convergence of the Newton
# series kicks in.

# We construct a dummy `Qobj` that counts its own applications to a state:

# In[22]:


class CountingQobj(qutip.Qobj):
    """A `qutip.Qobj` that counts how often it is multiplied onto another
    `Qobj`, via the `counter` attribute."""

    def __init__(self, *args, **kwargs):
        self.counter = 0  # number of Qobj-Qobj multiplications so far
        super().__init__(*args, **kwargs)

    def __mul__(self, other):
        # Count only applications to another Qobj (i.e., to a state);
        # scalar multiplications are not "applications of L".
        if isinstance(other, qutip.Qobj):
            self.counter += 1
        return super().__mul__(other)


# In[23]:


def count_applications_newton(nt, tol=1e-8):
    """Return the average number of applications of the Liouvillian per time
    step when propagating with the Newton propagator over `nt` time steps.

    Args:
        nt: number of points in the time grid on [0, 10]
        tol: precision passed to the Newton propagator
    """
    tlist = np.linspace(0, 10, nt)
    L_count = CountingQobj(L)
    # Fixed: previously the literal ``tol=1e-8`` was passed here instead of
    # the function's ``tol`` argument, silently ignoring the parameter.
    propagate(L_count, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=tol)
    return L_count.counter / len(tlist)


# In[24]:

count_applications_newton(nt=10)

# In[25]:

count_applications_newton(nt=100)

# To compare this to the average number of applications in `mesolve`, we use
# a trimmed-down version of the `mesolve` routine:

# In[26]:

import scipy
from qutip.solver import Options
from qutip.superoperator import mat2vec
from qutip.cy.spmatfuncs import cy_ode_rhs
from qutip.mesolve import _generic_ode_solve
from qutip.ui.progressbar import BaseProgressBar


def mesolve(L, rho0, tlist):
    """Trimmed-down re-implementation of `qutip.mesolve` that returns the
    total number of right-hand-side evaluations (applications of L) instead
    of the propagated states.

    Args:
        L: Liouvillian superoperator (`qutip.Qobj`)
        rho0: initial density matrix (`qutip.Qobj`)
        tlist: equidistant time grid

    Returns:
        int: total number of calls to the ODE right-hand side.
    """
    opt = Options()

    def func(t, rho, data, ind, ptr):
        # Wrap the cython RHS so we can count invocations on the function
        # attribute `func.counter`.
        func.counter += 1
        return cy_ode_rhs(t, rho, data, ind, ptr)

    func.counter = 0
    r = scipy.integrate.ode(func)
    # Pass the CSR components of L directly, as qutip.mesolve does.
    r.set_f_params(L.data.data, L.data.indices, L.data.indptr)
    r.set_integrator(
        'zvode',
        method=opt.method,
        order=opt.order,
        atol=opt.atol,
        rtol=opt.rtol,
        nsteps=opt.nsteps,
        first_step=opt.first_step,
        min_step=opt.min_step,
        max_step=opt.max_step,
    )
    # Vectorize rho0 (column stacking) to match the superoperator convention.
    initial_vector = mat2vec(rho0.full()).ravel('F')
    r.set_initial_value(initial_vector, tlist[0])
    dt = tlist[1] - tlist[0]  # assumes an equidistant tlist
    for step in range(len(tlist) - 1):
        r.integrate(r.t + dt)
    return func.counter


# In[27]:


def count_applications_mesolve(nt):
    """Return the average number of applications of L per time step for the
    (trimmed-down) `mesolve` on an `nt`-point time grid over [0, 10]."""
    tlist = np.linspace(0, 10, nt)
    return mesolve(L, rho0, tlist) / len(tlist)


# In[28]:

count_applications_mesolve(10)

# In[29]:

count_applications_mesolve(100)