Hard Core Tutorial

Typically in fitting, performance matters. Plain Python is slow because it does a lot of extra work (name lookups etc.) for good reasons. We can fix that with Cython and numpy. This tutorial demonstrates how to write a model that takes data and fits to it, in two ways: a fast way and a generic way. As a bonus, we show how to parallelize your cost function.

Basic Cython

Before we go on, let's talk about how to use Cython efficiently. Cython speeds things up by using static types where it can. Generally, the more type information you give it, the better the C code it can generate.

Cython has a very handy option called annotate which shows you which lines of code compile to static C and which ones call into Python objects.
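The same report is available outside the notebook. A quick sketch, assuming your module lives in a file called mymodule.pyx (a hypothetical name):

cython -a mymodule.pyx  # writes an annotated mymodule.html next to the source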

In [1]:
%pylab inline
%load_ext Cython
from iminuit import Minuit
Populating the interactive namespace from numpy and matplotlib
In [2]:
%%cython --annotate

def slow_f(n):
    x = 100.
    for i in range(n):
        x+=n
    return x

#you tell it more type information like this
def fast_f(int n):
    cdef double x=100.
    cdef int i
    for i in range(n):
        x+=n
    return x
Out[2]:
Cython: _cython_magic_3326084b0b6a09d518fcf08bbdc8d5b3.pyx

Generated by Cython 0.27.3

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: 
+02: def slow_f(n):
+03:     x = 100.
+04:     for i in range(n):
+05:         x+=n
+06:     return x
 07: 
 08: #you tell it more type information like this
+09: def fast_f(int n):
+10:     cdef double x=100.
 11:     cdef int i
+12:     for i in range(n):
+13:         x+=n
+14:     return x
 15: 

(The expanded C code behind each "+" line is omitted here. The point to notice: every line of slow_f goes through Python C-API calls such as PyNumber_InPlaceAdd, while the loop body of fast_f compiles down to a plain C for loop with C arithmetic.)
You can see that there are yellow lines and white lines.

  • a yellow line makes calls into Python code
  • a white line compiles to native C

Basically, your goal is to get as many white lines as possible, by giving Cython as much type information as possible.

You can also click on each line to see what C code Cython actually generates for it. (You may need to double-click.)

In [3]:
%timeit -n10 -r10 slow_f(100)
The slowest run took 24.52 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 10: 5.89 µs per loop
In [4]:
%timeit -n10 -r10 fast_f(100)
10 loops, best of 10: 95.4 ns per loop

Quick and Dirty Way

Let's look at how to write a Cython cost function.

In [33]:
np.random.seed(0)
data = 2 + 3 * randn(int(2e6)) #mu=2, sigma=3
hist(data, bins=100, histtype='step');
In [34]:
%%cython --force
# use --annotate if you wonder what kind of code it generates
cimport cython
import numpy as np
cimport numpy as np  # the cimport adds fast C-level access on top of the Python import
from libc.math cimport exp, M_PI, sqrt, log
from iminuit.util import describe, make_func_code

@cython.embedsignature(True)  # dump the signature so describe works
cpdef double mypdf(double x, double mu, double sigma):
    # cpdef generates both a C function and a Python wrapper
    cdef double norm = 1./(sqrt(2*M_PI)*sigma)
    cdef double ret = exp(-1*(x-mu)*(x-mu)/(2.*sigma*sigma))*norm
    return ret

cdef class QuickAndDirtyLogLH:  # cdef is here to reduce name lookups in __call__
    cdef np.ndarray data
    cdef int ndata
    
    def __init__(self, data):
        self.data = data
        self.ndata = len(data)
    
    @cython.embedsignature(True)  # you need this to dump the function signature into the docstring
    def compute(self, double mu, double sigma):
        #this line is a cast, not a copy; it lets cython know mydata yields doubles
        cdef np.ndarray[np.double_t, ndim=1] mydata = self.data
        cdef double loglh = 0.
        cdef double thisdata
        cdef int i
        for i in range(self.ndata):
            thisdata = mydata[i]
            loglh -= log(mypdf(thisdata, mu, sigma))
        return loglh
In [35]:
describe(mypdf)  # works because we embedded the signature
Out[35]:
[u'x', u'mu', u'sigma']
In [36]:
lh = QuickAndDirtyLogLH(data)
describe(lh.compute)  # works because we embedded the signature
Out[36]:
[u'mu', u'sigma']
In [37]:
m = Minuit(lh.compute, mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), errordef=1)
m.migrad();

FCN = 5033909.39654    TOTAL NCALL = 44    NCALLS = 44
EDM = 1.41205900229e-05    GOAL EDM = 1e-05    UP = 1.0

Valid   Valid Param   Accurate Covar   PosDef   Made PosDef
True    True          True             True     False

Hesse Fail   HasCov   Above EDM   Reach calllim
False        True     False       False

+  Name    Value     Hesse Error   Minos Error-   Minos Error+   Limit-   Limit+   Fixed?
0  mu      2.00272   0.00299822                                                    No
1  sigma   2.99822   0.00212007                                  0.1      10       No

In [38]:
%%timeit -n2 -r2 #-n1 -r1 
m = Minuit(lh.compute, mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0),print_level=0, errordef=1)
m.migrad()
2 loops, best of 2: 2.01 s per loop

Have your Cython PDF in a separate file

Often your model is too complicated to fit comfortably in an IPython notebook, or you may want to reuse your PDF across many notebooks. We have external_pdf.pyx in the same directory as this tutorial; below is a sketch of what it might contain, followed by how to load it.
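The shipped file's exact contents are not reproduced here; this is a sketch of what such a file could contain, assuming it simply mirrors the QuickAndDirtyLogLH class above:

# external_pdf.pyx (sketch; the file shipped with the tutorial may differ)
cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport exp, M_PI, sqrt, log

@cython.embedsignature(True)
cpdef double mypdf(double x, double mu, double sigma):
    return exp(-1*(x-mu)*(x-mu)/(2.*sigma*sigma))/(sqrt(2*M_PI)*sigma)

cdef class External_LogLH:
    cdef np.ndarray data
    cdef int ndata

    def __init__(self, data):
        self.data = data
        self.ndata = len(data)

    @cython.embedsignature(True)
    def compute(self, double mu, double sigma):
        cdef np.ndarray[np.double_t, ndim=1] mydata = self.data  # cast, not a copy
        cdef double loglh = 0.
        cdef int i
        for i in range(self.ndata):
            loglh -= log(mypdf(mydata[i], mu, sigma))
        return loglh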

In [39]:
import pyximport
pyximport.install(
    setup_args=dict(
        include_dirs=[np.get_include()],  # include directories
    #    libraries=['m'],                 # stuff you need to link against (no -l)
    #    library_dirs=['some/dir'],       # library directories
    #    extra_compile_args=['-g','-O2'],
    #    extra_link_args=['-some-link-flags'],
    ),
    reload_support=True,  # you may also find this useful
)  # if anything funny is going on, look at your console
import external_pdf
In [40]:
# reload(external_pdf)  # you may find this useful for reloading your module (Python 3: from importlib import reload)
In [41]:
lh = external_pdf.External_LogLH(data)
In [42]:
m = Minuit(lh.compute,mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), errordef=1)
m.migrad();

FCN = 5033909.39654    TOTAL NCALL = 44    NCALLS = 44
EDM = 1.41205900229e-05    GOAL EDM = 1e-05    UP = 1.0

Valid   Valid Param   Accurate Covar   PosDef   Made PosDef
True    True          True             True     False

Hesse Fail   HasCov   Above EDM   Reach calllim
False        True     False       False

+  Name    Value     Hesse Error   Minos Error-   Minos Error+   Limit-   Limit+   Fixed?
0  mu      2.00272   0.00299822                                                    No
1  sigma   2.99822   0.00212007                                  0.1      10       No

In [43]:
%%timeit -r2 -n2 
m = Minuit(lh.compute,mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), print_level=0, errordef=1)
m.migrad()
2 loops, best of 2: 2.18 s per loop

Generic Reusable Cost Function

Sometimes we want to write a cost function that takes in any PDF and any data and computes the appropriate cost. This is slower than the previous example, but it makes your code much more reusable. This is how you do it.

In [44]:
%%cython
# use --annotate if you wonder what kind of code it generates
cimport cython
import numpy as np
cimport numpy as np  # the cimport adds fast C-level access on top of the Python import
from iminuit.util import make_func_code, describe
from libc.math cimport log

cdef class LogLH:  # cdef is here to reduce name lookups in __call__ (think of a struct)
    cdef np.ndarray data
    cdef int ndata
    cdef public func_code
    cdef object pdf
    
    def __init__(self, pdf, data):
        self.data = data
        self.ndata = len(data)
        #the important line is here
        self.func_code = make_func_code(describe(pdf)[1:])  # [1:] strips off the independent parameter
        self.pdf = pdf
    
    #@cython.boundscheck(False)  # you can turn off bounds checking
    def __call__(self, *args):
        # the next line is very important: without it cython has no idea about the data type
        cdef np.ndarray[np.double_t, ndim=1] mydata = self.data
        cdef double loglh = 0.
        cdef int i
        cdef list larg = [0.]+list(args)
        
        for i in range(self.ndata):
            #this is slower because we have to do a lot of Python-level work
            #to make a generic function call;
            #if you are a Python hacker and know how to get around this,
            #please let us know
            larg[0] = mydata[i]
            loglh -= log(self.pdf(*larg))
        return loglh

And your favorite PDF

In [45]:
%%cython
#use --annotate if you wonder what kind of code it generates
from libc.math cimport exp, M_PI, sqrt, log
cimport cython

@cython.binding(True)
def mypdf(double x, double mu, double sigma): 
    # binding(True) makes the function signature visible so that describe works
    cdef double norm = 1./(sqrt(2*M_PI)*sigma)
    cdef double ret = exp(-1*(x-mu)*(x-mu)/(2.*sigma*sigma))*norm
    return ret
In [46]:
mylh = LogLH(mypdf,data)
In [47]:
print(describe(mypdf))
print(describe(mylh))
['x', 'mu', 'sigma']
['mu', 'sigma']
In [48]:
describe(mypdf)
Out[48]:
['x', 'mu', 'sigma']
In [49]:
m = Minuit(mylh, mu=1.5, sigma=2.5, error_mu=0.1, 
           error_sigma=0.1, limit_sigma=(0.1,10.0), errordef=1)
# it's slower than before
m.migrad();

FCN = 5033909.39654    TOTAL NCALL = 44    NCALLS = 44
EDM = 1.41205900229e-05    GOAL EDM = 1e-05    UP = 1.0

Valid   Valid Param   Accurate Covar   PosDef   Made PosDef
True    True          True             True     False

Hesse Fail   HasCov   Above EDM   Reach calllim
False        True     False       False

+  Name    Value     Hesse Error   Minos Error-   Minos Error+   Limit-   Limit+   Fixed?
0  mu      2.00272   0.00299822                                                    No
1  sigma   2.99822   0.00212007                                  0.1      10       No

In [50]:
%%timeit -r2 -n2
m=Minuit(mylh, mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), print_level=0, errordef=1)
m.migrad() # you can feel it's much slower than before
2 loops, best of 2: 6.34 s per loop
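The per-event Python call in the loop is what costs us here. If your PDF can accept a whole numpy array for x, one way to avoid that call is to vectorize the cost function. A minimal sketch, assuming an array-aware pdf (the scalar Cython mypdf above would need a vectorized variant):

import numpy as np
from iminuit.util import make_func_code, describe

class VectorizedLogLH:
    # works with any pdf whose first argument accepts a numpy array
    def __init__(self, pdf, data):
        self.pdf = pdf
        self.data = data
        self.func_code = make_func_code(describe(pdf)[1:])

    def __call__(self, *args):
        # a single call into the pdf for the whole dataset, instead of one per event
        return -np.sum(np.log(self.pdf(self.data, *args)))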
In [55]:
# show histogram of data and fitted pdf
x = linspace(-10,12,100)
initial = np.fromiter((mypdf(xx, 1.5, 2.5) for xx in x), float);
fit = np.fromiter((mypdf(xx, m.values['mu'], m.values['sigma']) for xx in x), float);
plot(x, initial, label='initial values')
plot(x, fit, label='best fit')
hist(data, normed=True, bins=100, histtype='step', label='data')  # in newer matplotlib use density=True
legend();

Parallel processing with multiprocessing

This example most likely does not work on Windows. Working around the lack of fork() on Windows takes so many workarounds that it's easier to install a decent Linux on your computer.

The idea behind parallelizing your cost function is to split your data into multiple chunks, have each worker calculate the cost function for its chunk, then collect the partial results at the end and add them up.

The tool we will be showing here is Python's multiprocessing module. We will rewrite our generic cost function, but now with multiprocessing.

You might be worried that forking processes will copy your data. Most modern OSes use copy-on-write (look it up on Wikipedia): when a process is forked, its memory is not actually copied until somebody writes to it.
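A minimal pure-Python sketch of this chunk-and-sum pattern (the Gaussian is inlined here and the names are illustrative; the Cython class below does the same thing event by event):

import numpy as np
from multiprocessing import Pool

def _partial_nll(task):
    # each worker computes the negative log-likelihood of its own chunk
    chunk, mu, sigma = task
    norm = 1. / (np.sqrt(2 * np.pi) * sigma)
    return -np.sum(np.log(norm * np.exp(-(chunk - mu) ** 2 / (2. * sigma ** 2))))

def total_nll(data, mu, sigma, njobs=4):
    chunks = np.array_split(data, njobs)  # split the data into one chunk per worker
    pool = Pool(njobs)
    try:
        parts = pool.map(_partial_nll, [(c, mu, sigma) for c in chunks])
    finally:
        pool.close()
        pool.join()
    return sum(parts)  # add up the partial sums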

In [56]:
%%cython -f
cimport cython
import numpy as np
cimport numpy as np  # the cimport adds fast C-level access on top of the Python import
from libc.math cimport exp, M_PI, sqrt, log, floor
from libc.stdio cimport printf
from iminuit.util import make_func_code, describe
import multiprocessing as mp
from multiprocessing import Manager
import os
#import logging
#logger = multiprocessing.log_to_stderr()
#don't do this in ipython either, it will crash (watch ipython #2438)
#logger.setLevel(multiprocessing.SUBDEBUG)

@cython.embedsignature(True)  # dump the signature so describe works
cpdef double mypdf(double x, double mu, double sigma):
    # cpdef generates both a C function and a Python wrapper
    cdef double norm
    cdef double ret
    norm = 1./(sqrt(2*M_PI)*sigma)
    ret = exp(-1*(x-mu)*(x-mu)/(2.*sigma*sigma))*norm
    return ret

cdef class Multiprocess_LogLH:  # cdef is here to reduce name lookups in __call__
    cdef np.ndarray data
    #cdef double* databuffer  # I'm not really sure whether numpy does copy-on-write only or not
    cdef int ndata
    cdef public func_code
    cdef object pdf
    cdef int njobs
    cdef list starts
    cdef list stops
    cdef object pool
    cdef int i
    cdef object manager
    cdef object results
    def __init__(self, pdf, np.ndarray[np.double_t] data, njobs=None):
        self.data = data
        #self.databuffer = <double*>data.data
        self.ndata = len(data)
        
        #the important line is here
        self.func_code = make_func_code(describe(pdf)[1:])  # [1:] strips off the independent parameter
        self.pdf = pdf
        
        #worker pool stuff
        self.njobs = njobs if njobs is not None else mp.cpu_count()
        print('Number of CPU: ',self.njobs)
        #determine the chunk size; use integer division so starts/stops stay integers
        chunk_size = self.ndata // self.njobs
        self.starts = [i*chunk_size for i in range(self.njobs)]
        self.stops = [(i+1)*chunk_size for i in range(self.njobs)]
        self.stops[-1] = self.ndata  # put the events lost to round-off back into the last chunk
        self.i=0
        self.manager = Manager()
        self.results = self.manager.Queue()
    
    @cython.embedsignature(True)
    cpdef process_chunk(self, 
            int pid, int start, int stop, tuple args, object results):  # the range is [start, stop)
        #be careful here: there is a bug in ipython that prevents
        #child processes from printing to stdout/stderr (you will get a segfault)
        #I submitted a patch https://github.com/ipython/ipython/pull/2712
        #for now, do something like this if you need to debug:
        #msg = str(('Start Worker:', pid, start, stop, os.getpid()))+'\n'
        #printf(msg)
        #it will run fine as a python script though
        cdef np.ndarray[np.double_t, ndim=1] mydata = self.data  # let cython know the type
        cdef int i
        cdef double loglh = 0.
        cdef double tmp = 0.
        cdef tuple t
        #calculate lh for this chunk
        cdef list larg = [0.]+list(args)
        for i in range(start,stop):
            tmp = mydata[i]
            larg[0] = tmp
            loglh -= log(self.pdf(*larg))
        results.put(loglh)  # put the result in the multiprocessing queue
        return loglh  # the return value is ignored by __call__ but is useful for testing

    def __call__(self, *args):
        cdef double ret=0
        pool = [mp.Process(target=self.process_chunk,
                           args=(i,self.starts[i],self.stops[i],args,self.results)) 
                    for i in range(self.njobs)]
        #you may think that forking this many times is inefficient;
        #we could do better, but this is not bad: since most of the time
        #is spent calculating your loglh, the forks are cheap by comparison.
        #if forking is more expensive than what each worker does then...
        #your problem is something else.
        #copy-on-write ensures that the data points are never copied (unless you write to them)
        self.i+=1
        for p in pool: p.start() #start everyone
        for p in pool: p.join() #wait for everyone to finish
        while not self.results.empty(): #collect the result
            tmp = self.results.get()
            ret += tmp
        return ret
In [57]:
mlh = Multiprocess_LogLH(mypdf,data)
('Number of CPU: ', 4)
In [58]:
#good idea to debug it in a non-multiprocess environment first
import multiprocessing as mp
q = mp.Queue()
mlh.process_chunk(0,0,10,(0,1),q)
Out[58]:
140.06719362225851
In [59]:
#then see if it works on one point
mlh(0,1)
Out[59]:
14838060.237849694
In [60]:
m = Minuit(mlh,mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), errordef=1)
m.migrad();

FCN = 5033909.39654    TOTAL NCALL = 44    NCALLS = 44
EDM = 1.41124098393e-05    GOAL EDM = 1e-05    UP = 1.0

Valid   Valid Param   Accurate Covar   PosDef   Made PosDef
True    True          True             True     False

Hesse Fail   HasCov   Above EDM   Reach calllim
False        True     False       False

+  Name    Value     Hesse Error   Minos Error-   Minos Error+   Limit-   Limit+   Fixed?
0  mu      2.00272   0.00299822                                                    No
1  sigma   2.99822   0.00212007                                  0.1      10       No

In [61]:
%%timeit -r2 -n2
m = Minuit(mlh,mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0),print_level=0, errordef=1)
m.migrad()
2 loops, best of 2: 4.68 s per loop

Parallel Computing With Cython and OpenMP

For this tutorial you will need a compiler with OpenMP support. GCC has it; however, clang (the default cc on macOS) does NOT support it.

This is not recommended: do this only when the overhead of multiprocessing is too big for you. The performance gain is probably not worth the headache.

Computers nowadays are multi-core machines, so it makes sense to utilize all the cores. This method is fast but quite restricted and cumbersome, since you need to write your function such that Cython can figure out that it is reentrant, and you need some understanding of thread-local versus thread-shared variables.

You can read about prange on the Cython wiki for more information and for how to gain more complete control over parallelization. The official documentation is here

In [62]:
%%cython -f -c-fopenmp --link-args=-fopenmp -c-g
#use --annotate if you wonder what kind of code it generates
cimport cython
import numpy as np
cimport numpy as np  # the cimport adds fast C-level access on top of the Python import
from libc.math cimport exp, M_PI, sqrt, log
from iminuit.util import describe, make_func_code
import multiprocessing
from cython.parallel import prange


#notice nogil at the end (it releases the Global Interpreter Lock)
#cython doesn't do a super thorough check for this,
#so make sure your function is reentrant; approximately this means
#a simple function that computes simple stuff from local variables, with no read/write to globals
@cython.embedsignature(True)  # dump the signature so describe works
@cython.cdivision(True)
cpdef double mypdf(double x, double mu, double sigma) nogil:
    # cpdef generates both a C function and a Python wrapper
    cdef double norm
    cdef double ret
    norm = 1./(sqrt(2*M_PI)*sigma)
    ret = exp(-1*(x-mu)*(x-mu)/(2.*sigma*sigma))*norm
    return ret

cdef class ParallelLogLH:  # cdef is here to reduce name lookups in __call__
    cdef np.ndarray data
    cdef int ndata
    cdef int njobs
    cdef np.ndarray buf  # buffer for storing the result from each job (unused in this example)
    def __init__(self, data, njobs=None):
        self.data = data
        self.ndata = len(data)
        self.njobs = njobs if njobs is not None else multiprocessing.cpu_count()
        print('Number of CPU: ',self.njobs)
    
    @cython.boundscheck(False)
    @cython.embedsignature(True)  # you need this to dump the function signature into the docstring
    def compute(self, double mu, double sigma):
        cdef np.ndarray[np.double_t, ndim=1] mydata = self.data
        cdef double loglh = 0.
        cdef tuple t
        cdef double thisdata
        cdef int i=0
        #in parallel computing you need to be careful about which variables are
        #thread-private and which are shared between threads,
        #otherwise you will run into hard-to-detect race conditions
        #cython's rule of thumb (guessing rule) is:
        # 1) assigned before use -> thread-private
        # 2) read-only -> thread-shared
        # 3) in-place modification only -> thread-shared (a reduction)
        cdef int njobs = self.njobs
        cdef double tmp
        with nogil:
            for i in prange(self.ndata, 
                            num_threads=njobs, 
                            chunksize=100000, 
                            schedule='dynamic'):  # split the loop across many threads
                thisdata = mydata[i]  # assigned before read, so thread-private
                tmp = mypdf(thisdata, mu, sigma)  # also assigned before read
                loglh -= log(tmp)  # in-place modification only, so loglh is thread-shared (a reduction)
        return loglh
---------------------------------------------------------------------------
CompileError                              Traceback (most recent call last)
<ipython-input-62-ab6c2a0be7e3> in <module>()
----> 1 get_ipython().run_cell_magic(u'cython', u'-f -c-fopenmp --link-args=-fopenmp -c-g', ...)

[traceback through IPython's cython magic and the distutils build chain omitted;
it merely echoes the cell source above]

CompileError: command 'cc' failed with exit status 1

The compile fails here because cc on this machine is clang, which, as noted above, does not support OpenMP.
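If you hit this, one way out is to build the same module outside the notebook with gcc. A sketch of a possible setup.py, assuming the cell body was saved as parallel_lh.pyx (a hypothetical file name):

# setup.py (sketch)
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

ext = Extension(
    "parallel_lh",
    sources=["parallel_lh.pyx"],
    include_dirs=[np.get_include()],
    extra_compile_args=["-fopenmp"],  # enable OpenMP in the compiler
    extra_link_args=["-fopenmp"],     # link against the OpenMP runtime
)
setup(ext_modules=cythonize([ext]))

Build it with CC=gcc python setup.py build_ext --inplace and import parallel_lh as usual.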
In [ ]:
plh = ParallelLogLH(data)
In [ ]:
plh.compute(1.5,2.0)
In [ ]:
describe(plh.compute)
In [ ]:
m = Minuit(plh.compute, mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), print_level=0, errordef=1)
m.migrad();
In [ ]:
%%timeit -n2 -r2
m = Minuit(plh.compute, mu=1.5, sigma=2.5, error_mu=0.1, 
    error_sigma=0.1, limit_sigma=(0.1,10.0), print_level=0, errordef=1)
m.migrad()

OpenCL

Want to do fitting using parallel GPU cores? If you have a multi-core GPU cluster/machine, figure out how to do it and let me know.

I don't have a machine with such capability, so I never bothered.... Interested in buying one for me? :P