#!/usr/bin/env python
# coding: utf-8

# # Benchmarks

# In[1]:

# NBVAL_IGNORE_OUTPUT
get_ipython().run_line_magic('load_ext', 'watermark')
import os
import qutip
import numpy as np
from numpy import pi
import matplotlib
import matplotlib.pylab as plt
import newtonprop
import snakeviz
import numba
get_ipython().run_line_magic('watermark', '-v --iversions')

# While only available interactively, `snakeviz` is a very good way to look
# at profiling data:

# In[2]:

get_ipython().run_line_magic('load_ext', 'snakeviz')

# The benchmarks are based on the Examples, so we rerun the entire Example
# notebook in this context:

# In[3]:

get_ipython().run_cell_magic('capture', '', '%run example.ipynb\n')

# ## Propagation runtime

# In[4]:

tlist = np.linspace(0, 10, 100)

# First, we measure how long the propagation with QuTiP takes. This happens
# mostly in compiled code, so it is pretty fast.

# In[5]:

get_ipython().run_cell_magic('timeit', '', 'qutip.mesolve(L, rho0, tlist)\n')

# In[6]:

get_ipython().run_cell_magic('timeit', '', 'propagate_expm(L, rho0, tlist)\n')

# The Newton propagator, being a reference implementation, is implemented in
# pure Python, and is several orders of magnitude slower. To make the
# comparison fair, we limit the precision to $10^{-8}$, which is roughly the
# precision of `mesolve`.

# In[7]:

get_ipython().run_cell_magic('timeit', '', 'propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8)\n')

# When using lower-level data types, things get considerably faster:

# In[8]:

get_ipython().run_cell_magic('timeit', '', 'propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)\n')

# In[9]:

get_ipython().run_cell_magic('timeit', '', 'propagate(L_vectorized, rho0_vectorized, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)\n')

# ## Profiling

# We can profile how much time is spent in the various routines, comparing
# `mesolve` and different variations of the Newton propagator.
# See https://docs.python.org/3/library/profile.html#instant-user-s-manual
# for the meaning of the columns.

# ### mesolve

# First, we look at QuTiP's `mesolve`:

# In[10]:

stats = get_ipython().run_line_magic('prun', '-q -r qutip.mesolve(L, rho0, tlist);')

# We can look at which top-level routines we spent the most time in
# *cumulatively*, that is, including sub-calls:

# In[11]:

stats.sort_stats('cumtime').print_stats(10);

# Or, the bottom-level routines where we *actually* spent time:

# In[12]:

stats.sort_stats('tottime').print_stats(10);

# This is dominated by the ODE solver and sparse matrix operations.

# If we're working interactively, we could use `snakeviz` to analyze further
# details:

# In[13]:

#%snakeviz qutip.mesolve(L, rho0, tlist)

# ### qutip-propagation

# Next, we look at the Newton propagator operating on high-level qutip
# objects:

# In[14]:

#%snakeviz propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8)

# In[15]:

stats = get_ipython().run_line_magic('prun', '-q -r propagate(L, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=1e-8);')

# In[16]:

stats.sort_stats('cumtime').print_stats(10);

# In[17]:

stats.sort_stats('tottime').print_stats(10);

# ### cythonized qutip-propagation

# Lastly, we can look at the more efficient propagation using a cythonized
# application of the QuTiP objects:

# In[18]:

#%snakeviz propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8)

# In[19]:

stats = get_ipython().run_line_magic('prun', '-q -r propagate(apply_cythonized_L, rho0_data, tlist, zero_vectorized, norm_vectorized, inner_vectorized, tol=1e-8);')

# In[20]:

stats.sort_stats('cumtime').print_stats(10);

# In[21]:

stats.sort_stats('tottime').print_stats(10);

# We now see that the (Python-level) Arnoldi and step routines of Newton come
# out as the bottleneck, as the application of the Liouvillian has become
# efficient (running at C speed).
# Note that the Newton implementation, in particular the `_extend_leja`
# function, has been sped up through the use of numba. Without that,
# `_extend_leja` would dominate.

# ## Number of function evaluations

# Since the Newton propagator is ultimately still limited by being
# implemented in Python, it is more fair to measure the runtime in terms of
# the average number of applications of the Liouvillian per time step. This
# is under the assumption that in an efficient implementation (and for large
# Hilbert spaces), this is the dominating factor.
#
# The number of applications depends on the chosen precision and on the
# length of the time step: longer time steps tend to be more efficient, as
# only then we're in a regime where the fast convergence of the Newton
# series kicks in.

# We construct a dummy `Qobj` that counts its own applications to a state:

# In[22]:


class CountingQobj(qutip.Qobj):
    """A `qutip.Qobj` that counts how often it is multiplied onto another
    `Qobj`, via the `counter` attribute."""

    def __init__(self, *args, **kwargs):
        self.counter = 0  # number of Qobj-Qobj multiplications so far
        super().__init__(*args, **kwargs)

    def __mul__(self, other):
        # Count only applications to another Qobj (i.e., to a state);
        # scalar multiplications are not "applications of L".
        if isinstance(other, qutip.Qobj):
            self.counter += 1
        return super().__mul__(other)


# In[23]:


def count_applications_newton(nt, tol=1e-8):
    """Return the average number of applications of the Liouvillian per time
    step when propagating with the Newton propagator over `nt` time steps.

    Args:
        nt: number of points in the time grid on [0, 10]
        tol: precision passed to the Newton propagator
    """
    tlist = np.linspace(0, 10, nt)
    L_count = CountingQobj(L)
    # Fixed: previously the literal ``tol=1e-8`` was passed here instead of
    # the function's ``tol`` argument, silently ignoring the parameter.
    propagate(L_count, rho0, tlist, zero_qutip, norm_qutip, inner_qutip, tol=tol)
    return L_count.counter / len(tlist)


# In[24]:

count_applications_newton(nt=10)

# In[25]:

count_applications_newton(nt=100)

# To compare this to the average number of applications in `mesolve`, we use
# a trimmed-down version of the `mesolve` routine:

# In[26]:

import scipy
from qutip.solver import Options
from qutip.superoperator import mat2vec
from qutip.cy.spmatfuncs import cy_ode_rhs
from qutip.mesolve import _generic_ode_solve
from qutip.ui.progressbar import BaseProgressBar


def mesolve(L, rho0, tlist):
    """Trimmed-down re-implementation of `qutip.mesolve` that returns the
    total number of right-hand-side evaluations (applications of L) instead
    of the propagated states.

    Args:
        L: Liouvillian superoperator (`qutip.Qobj`)
        rho0: initial density matrix (`qutip.Qobj`)
        tlist: equidistant time grid

    Returns:
        int: total number of calls to the ODE right-hand side.
    """
    opt = Options()

    def func(t, rho, data, ind, ptr):
        # Wrap the cython RHS so we can count invocations on the function
        # attribute `func.counter`.
        func.counter += 1
        return cy_ode_rhs(t, rho, data, ind, ptr)

    func.counter = 0
    r = scipy.integrate.ode(func)
    # Pass the CSR components of L directly, as qutip.mesolve does.
    r.set_f_params(L.data.data, L.data.indices, L.data.indptr)
    r.set_integrator(
        'zvode',
        method=opt.method,
        order=opt.order,
        atol=opt.atol,
        rtol=opt.rtol,
        nsteps=opt.nsteps,
        first_step=opt.first_step,
        min_step=opt.min_step,
        max_step=opt.max_step,
    )
    # Vectorize rho0 (column stacking) to match the superoperator convention.
    initial_vector = mat2vec(rho0.full()).ravel('F')
    r.set_initial_value(initial_vector, tlist[0])
    dt = tlist[1] - tlist[0]  # assumes an equidistant tlist
    for step in range(len(tlist) - 1):
        r.integrate(r.t + dt)
    return func.counter


# In[27]:


def count_applications_mesolve(nt):
    """Return the average number of applications of L per time step for the
    (trimmed-down) `mesolve` on an `nt`-point time grid over [0, 10]."""
    tlist = np.linspace(0, 10, nt)
    return mesolve(L, rho0, tlist) / len(tlist)


# In[28]:

count_applications_mesolve(10)

# In[29]:

count_applications_mesolve(100)