# Exercises for Session 3

In [1]:
from __future__ import print_function
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats, optimize

# better-looking plots: serif font, larger default figure and text size
plt.rcParams['font.family'] = 'serif'
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['font.size'] = 18

In [2]:
# draw 100 samples from a Normal distribution
# with mean 5 and standard deviation 1
# (seeded so the notebook is reproducible on re-run)
np.random.seed(54321)
n = 100   # number of samples
m = 5     # true mean
s = 1     # true standard deviation
d = stats.norm.rvs(loc=m, scale=s, size=n)

In [3]:
# histogram the data - this can be done with np.histogram,
# or simultaneously with plotting via pyplot
# (np.histogram returns counts first, then the bin edges)
#hy, hx = np.histogram(d, bins=10)
hy, hx, p = plt.hist(d, histtype='stepfilled', bins=10)

In [4]:
# bin width (plt.hist makes uniform bins) and the normalisation
# that scales a unit-area pdf up to the histogram counts
w = hx[1] - hx[0]
a = n * w
# bin centres: midpoint of each pair of adjacent edges
hxc = 0.5 * (hx[1:] + hx[:-1])

In [5]:
# returns the value of a Gaussian with specified amplitude,
# mean and sigma, evaluated at x (just for convenience)
def g(params, a, x):
    """Amplitude-scaled Gaussian: a * N(x; m, s).

    Parameters
    ----------
    params : tuple
        (m, s) — mean and standard deviation of the Gaussian.
    a : float
        Amplitude (area) scale factor.
    x : float or array_like
        Point(s) at which to evaluate the function.

    Note: rewritten from Python 2 tuple-parameter unpacking
    (``def g((m, s), a, x)``), which is a SyntaxError in Python 3
    (PEP 3113); callers still pass the (m, s) tuple unchanged.
    """
    m, s = params
    return a * stats.norm.pdf(x, m, s)

In [6]:
# returns chisq of specified Gaussian for data (x, y)
def f(params, a, x, y):
    """Sum of squared residuals between data y and the Gaussian model.

    Parameters
    ----------
    params : tuple
        (m, s) — mean and standard deviation passed through to ``g``.
    a : float
        Amplitude normalisation for the Gaussian.
    x, y : array_like
        Binned data: bin centres and bin counts.

    Note: rewritten from Python 2 tuple-parameter unpacking
    (``def f((m, s), a, x, y)``), which is a SyntaxError in Python 3;
    scipy.optimize.minimize passes the parameter vector as the first
    argument, which unpacks the same way.
    """
    m, s = params
    return ((y - g((m, s), a, x))**2).sum()

In [7]:
# find parameter set which minimises chisq
# (default gradient-based method, starting from (3, 3))
results = optimize.minimize(f, x0=(3, 3), args=(a, hxc, hy))
results  # fit is unsuccessful!

Out[7]:
   status: 2
success: False
njev: 42
nfev: 168
hess_inv: array([[1, 0],
[0, 1]])
fun: nan
x: array([ 157151.3125  ,  -47082.265625])
message: 'Desired error not necessarily achieved due to precision loss.'
jac: array([ nan,  nan])
In [8]:
# Use a simpler method: the Nelder-Mead simplex is derivative-free,
# so it copes with the flat/invalid regions that broke the default
results = optimize.minimize(f, x0=(3, 3), args=(a, hxc, hy),
                            method='Nelder-Mead')
results  # success!

Out[8]:
  status: 0
nfev: 75
success: True
fun: 101.20418191381646
x: array([ 5.26030092,  0.85567627])
message: 'Optimization terminated successfully.'
nit: 39
In [9]:
# Specify parameter bounds: the mean is unconstrained, while sigma
# must stay positive (bounds make minimize select L-BFGS-B)
results = optimize.minimize(f, x0=(3, 3), args=(a, hxc, hy),
                            bounds=((None, None), (0.01, None)))
results  # success!

Out[9]:
  status: 0
success: True
nfev: 108
fun: 101.20417934635323
x: array([ 5.26031668,  0.8557135 ])
jac: array([  4.26325641e-06,  -1.42108547e-06])
nit: 15
In [10]:
# Include parameter bounds in chisq function
def f(params, a, x, y):
    """Chi-squared of the Gaussian given by params = (m, s) for data (x, y).

    Returns a huge penalty (1e99) when sigma is non-positive, steering
    unconstrained minimisers away from the invalid parameter region.

    Note: rewritten from Python 2 tuple-parameter unpacking
    (``def f((m, s), a, x, y)``), which is a SyntaxError in Python 3.
    """
    m, s = params
    if s <= 0.0:
        return 1e99
    return ((y - g((m, s), a, x))**2).sum()

results = optimize.minimize(f, (3, 3), args=(a, hxc, hy))
results  # success!

Out[10]:
   status: 0
success: True
njev: 25
nfev: 106
hess_inv: array([[  4.15877622e-04,   1.68965375e-05],
[  1.68965375e-05,   3.05023445e-04]])
fun: 101.20417934635336
x: array([ 5.26031668,  0.8557135 ])
message: 'Optimization terminated successfully.'
jac: array([ -9.53674316e-07,  -5.72204590e-06])
In [11]:
# get fit parameters
# results.x is the best-fit (mean, sigma) vector from the last minimize call
r1 = results.x

In [12]:
# use MLE method provided in stats (actually understands that this is samples from a distribution)
# fits (mean, sigma) by maximum likelihood directly from the raw samples,
# avoiding the information loss of binning
r2 = stats.norm.fit(d)

In [13]:
# plot true function and fit results over histogram
x = numpy.arange(m-3*s, m+3*s, s/10.0)
plt.hist(d, bins=hx, histtype='stepfilled', linestyle=None,
alpha=0.1, label='sample')
plt.plot(x, g((m, s), a, x), ':', label='true', lw=3)
plt.plot(x, g(r1, a, x), label='optimize.fmin', lw=2)
plt.plot(x, g(r2, a, x), label='stats.norm.fit', lw=2)
plt.legend(prop={'size':'small'})

# neatly print results to screen
print('true function:  mean = {:5.3f}, sigma = {:5.3f}'.format(m, s))
print('optimize.fmin:  mean = {:5.3f}, sigma = {:5.3f}'.format(*r1))
print('stats.norm.fit: mean = {:5.3f}, sigma = {:5.3f}'.format(*r2))

true function:  mean = 5.000, sigma = 1.000
optimize.fmin:  mean = 5.260, sigma = 0.856
stats.norm.fit: mean = 5.223, sigma = 0.939

In [14]:
# Additional note:
# trick for drawing a binned function without original sample
# add zero bins at each end of distribution
hhxc = np.concatenate(([hxc[0]-w], hxc, [hxc[-1]+w]))
hhy = np.concatenate(([0], hy, [0]))
plt.plot(hhxc, hhy, drawstyle='steps-mid', label='sample')
_ = plt.axis(ymax=30)
# unfortunately producing a corresponding filled histogram is very messy :-/