Exercises for Session 3

In [1]:
from __future__ import print_function
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats, optimize

# better-looking plots
plt.rcParams['font.family'] = 'serif'
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['font.size'] = 18
In [2]:
# draw n samples from a Normal(mean=5, sigma=1) distribution;
# the fixed seed makes every re-run of the notebook reproducible
np.random.seed(54321)
n, m, s = 100, 5, 1
d = stats.norm.rvs(loc=m, scale=s, size=n)
In [3]:
# histogram the data - this can be done with np.histogram,
# or simultaneously with plotting via plt.hist, which returns
# the bin counts (hy), the bin edges (hx) and the patch artists (p)
#hy, hx = np.histogram(d, bins=10)
hy, hx, p = plt.hist(d, bins=10, histtype='stepfilled')
In [4]:
# get the width of the bins and hence normalisation
# (plt.hist produces equal-width bins, so one difference suffices)
w = hx[1]-hx[0]
# a = n samples * bin width = area under the histogram; used as the
# fixed amplitude when comparing a normalised pdf against the counts
a = n*w
# get the centres of the bins (hx holds edges, one more entry than hy)
hxc = 0.5*(hx[:-1] + hx[1:])
In [5]:
# returns the value of a Gaussian with specified amplitude,
# mean and sigma, evaluated at x (just for convenience)
def g(params, a, x):
    """Scaled Gaussian evaluated at x.

    Parameters
    ----------
    params : sequence (m, s) -- mean and standard deviation (an array
        of length 2 from the optimizer also works)
    a : amplitude (multiplies the normalised pdf)
    x : point(s) at which to evaluate; scalar or array

    Returns
    -------
    a * N(x; m, s), same shape as x.
    """
    # Python 2 tuple-parameter unpacking `def g((m, s), ...)` was removed
    # in Python 3 (PEP 3113); unpack inside the body instead. Callers
    # still pass g((m, s), a, x) unchanged.
    m, s = params
    return a * stats.norm.pdf(x, m, s)
In [6]:
# returns chisq of specified Gaussian for data (x, y)
def f(params, a, x, y):
    """Sum of squared residuals between data y and a scaled Gaussian.

    Parameters
    ----------
    params : sequence (m, s) -- mean and standard deviation being fitted
        (scipy.optimize passes these as a length-2 array)
    a : fixed amplitude of the Gaussian
    x, y : data arrays (bin centres and bin counts)

    Returns
    -------
    float: sum over bins of (y - model)^2.
    """
    # Python 2 tuple-parameter unpacking `def f((m, s), ...)` is a
    # SyntaxError in Python 3 (PEP 3113); unpack inside the body.
    m, s = params
    return ((y - g(params, a, x))**2).sum()
In [7]:
# find parameter set which minimises chisq
# (no method given: minimize defaults to BFGS for an unconstrained
#  problem, estimating gradients numerically)
results = optimize.minimize(f, (3, 3), args=(a, hxc, hy))
results  # fit is unsuccessful! (nan gradient -> precision-loss message below)
Out[7]:
   status: 2
  success: False
     njev: 42
     nfev: 168
 hess_inv: array([[1, 0],
       [0, 1]])
      fun: nan
        x: array([ 157151.3125  ,  -47082.265625])
  message: 'Desired error not necessarily achieved due to precision loss.'
      jac: array([ nan,  nan])
In [8]:
# Use a simpler method: the Nelder-Mead simplex needs no gradients,
# so it avoids the nan-gradient failure seen with the default method
results = optimize.minimize(f, (3, 3), args=(a, hxc, hy), method='Nelder-Mead')
results  # success!
Out[8]:
  status: 0
    nfev: 75
 success: True
     fun: 101.20418191381646
       x: array([ 5.26030092,  0.85567627])
 message: 'Optimization terminated successfully.'
     nit: 39
In [9]:
# Specify parameter bounds: sigma is kept >= 0.01 so the pdf never
# receives an invalid (non-positive) width; the mean is unbounded.
# When bounds are supplied, minimize automatically selects a bounded
# method (L-BFGS-B -- see the CONVERGENCE message in the output).
results = optimize.minimize(f, (3, 3), args=(a, hxc, hy), bounds=((None, None), (0.01, None)))
results  # success!
Out[9]:
  status: 0
 success: True
    nfev: 108
     fun: 101.20417934635323
       x: array([ 5.26031668,  0.8557135 ])
 message: 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     jac: array([  4.26325641e-06,  -1.42108547e-06])
     nit: 15
In [10]:
# Include parameter bounds in chisq function
def f(params, a, x, y):
    """Chi-square of a scaled Gaussian against data (x, y), with a large
    penalty for non-physical sigma.

    Returning a huge value (1e99) whenever s <= 0 steers an unbounded
    optimizer away from the invalid region, so the default method now
    converges without explicit bounds.

    Parameters
    ----------
    params : sequence (m, s) -- mean and standard deviation being fitted
    a : fixed amplitude of the Gaussian
    x, y : data arrays (bin centres and bin counts)
    """
    # Python 2 tuple-parameter unpacking `def f((m, s), ...)` is a
    # SyntaxError in Python 3 (PEP 3113); unpack inside the body.
    m, s = params
    if s <= 0.0:
        return 1e99
    else:
        return ((y - g(params, a, x))**2).sum()

results = optimize.minimize(f, (3, 3), args=(a, hxc, hy))
results  # success!
Out[10]:
   status: 0
  success: True
     njev: 25
     nfev: 106
 hess_inv: array([[  4.15877622e-04,   1.68965375e-05],
       [  1.68965375e-05,   3.05023445e-04]])
      fun: 101.20417934635336
        x: array([ 5.26031668,  0.8557135 ])
  message: 'Optimization terminated successfully.'
      jac: array([ -9.53674316e-07,  -5.72204590e-06])
In [11]:
# get fit parameters: best-fit [mean, sigma] array from the optimizer
r1 = results.x
In [12]:
# use MLE method provided in stats (actually understands that this is
# samples from a distribution); returns (mean, sigma) estimates fitted
# directly to the samples, not to the binned histogram
r2 = stats.norm.fit(d)
In [13]:
# plot true function and fit results over histogram
# (fix: the module was imported as `np`, so `numpy.arange` raised
#  NameError; use np.arange)
x = np.arange(m-3*s, m+3*s, s/10.0)  # evaluation grid: mean +/- 3 sigma
plt.hist(d, bins=hx, histtype='stepfilled', linestyle=None,
         alpha=0.1, label='sample')
plt.plot(x, g((m, s), a, x), ':', label='true', lw=3)
plt.plot(x, g(r1, a, x), label='optimize.fmin', lw=2)
plt.plot(x, g(r2, a, x), label='stats.norm.fit', lw=2)
plt.legend(prop={'size':'small'})

# neatly print results to screen
print('true function:  mean = {:5.3f}, sigma = {:5.3f}'.format(m, s))
print('optimize.fmin:  mean = {:5.3f}, sigma = {:5.3f}'.format(*r1))
print('stats.norm.fit: mean = {:5.3f}, sigma = {:5.3f}'.format(*r2))
true function:  mean = 5.000, sigma = 1.000
optimize.fmin:  mean = 5.260, sigma = 0.856
stats.norm.fit: mean = 5.223, sigma = 0.939
In [14]:
# Additional note:
# trick for drawing a binned function without original sample:
# add zero bins at each end of distribution so the stepped outline
# drops to zero instead of hanging in mid-air at the edges
hhxc = np.concatenate(([hxc[0]-w], hxc, [hxc[-1]+w]))
hhy = np.concatenate(([0], hy, [0]))
# drawstyle='steps-mid' centres each horizontal segment on its bin centre
plt.plot(hhxc, hhy, drawstyle='steps-mid', label='sample')
_ = plt.axis(ymax=30)  # assign to _ just to suppress the return-value echo
# unfortunately producing a corresponding filled histogram is very messy :-/