%matplotlib inline from collections import namedtuple import itertools import matplotlib.pyplot as plt import seaborn as sns import numpy as np import datetime as dt import scipy.stats as stats from scipy.interpolate import interp1d import scipy import statsmodels.api as sm from sumrv import SumRv bigfontsize=20 labelfontsize=16 tickfontsize=16 sns.set_context('talk') plt.rcParams.update({'font.size': bigfontsize, 'axes.labelsize':labelfontsize, 'xtick.labelsize':tickfontsize, 'ytick.labelsize':tickfontsize, 'legend.fontsize':tickfontsize, }) simple = stats.uniform(loc=2,scale=3) errscale = 0.25 err = stats.norm(loc=0,scale=errscale) # NB Kernel support array **MUST** be symmetric about centre of the kernel (error PDF) for this to work right. # Support also needs to extend about any significant areas of the component PDFs. # Here, we just define one massive support for both input PDF, and error PDF (kernel) # But we can do much better (see later) #NB step-size determines precision of approximation delta = 1e-4 big_grid = np.arange(-10,10,delta) # Cannot analytically convolve continuous PDFs, in general. # So we now make a probability mass function on a fine grid # - a discrete approximation to the PDF, amenable to FFT... pmf1 = simple.pdf(big_grid)*delta pmf2 = err.pdf(big_grid)*delta conv_pmf = scipy.signal.fftconvolve(pmf1,pmf2,'same') # Convolved probability mass function print "Grid length, sum(gauss_pmf), sum(uni_pmf),sum(conv_pmf):" print len(big_grid), sum(err.pdf(big_grid)*delta), sum(simple.pdf(big_grid)*delta), sum(conv_pmf) conv_pmf = conv_pmf/sum(conv_pmf) plt.plot(big_grid,pmf1, label='Tophat') plt.plot(big_grid,pmf2, label='Gaussian error') plt.plot(big_grid,conv_pmf, label='Sum') plt.xlim(-3,max(big_grid)) plt.legend(loc='best'), plt.suptitle('PMFs') print "FFT timings:" %timeit conv_pmf = scipy.signal.fftconvolve(pmf1,pmf2,'same') # Convolved probability mass function #print "Non-FFT timings:" ## Will take a while... 
#%timeit slow_conv_pmf = scipy.signal.convolve(pmf1,pmf2,'same') # Convolved probability mass function

# Wrap the convolution in a SumRv object (project-local helper) using a coarser grid.
sum_rv_delta_size = 1e-2
sum_rv = SumRv(simple, err, delta=sum_rv_delta_size, dist_truncation=0.0)

new_grid = np.linspace(-2, 7, 2**10)
plt.plot(new_grid, simple.pdf(new_grid), label='Tophat')
plt.plot(new_grid, err.pdf(new_grid), label='Gaussian error')
plt.plot(new_grid, sum_rv.pdf(new_grid), label='Sum')
#plt.xlim(-3,8)
plt.legend(loc='best')
plt.suptitle('Discretization and convolution via scipy routines')

# CDFs of the components and the convolved sum.
plt.plot(new_grid, simple.cdf(new_grid), label='Tophat')
plt.plot(new_grid, err.cdf(new_grid), label='Gaussian error')
plt.plot(new_grid, sum_rv.cdf(new_grid), label='Sum')
plt.xlim(min(new_grid), max(new_grid))
plt.legend(loc='best')
plt.suptitle('CDFs')
plt.ylim(-0.1, 1.1)

# Demonstrate use of KDEUnivariate to plot kernel shape,
# by running on single sample point:
sample = np.zeros(1)
print("A single sample for our KDE:", sample)
kernel = sm.nonparametric.KDEUnivariate(sample)
sigma = 2.5
kernel.fit(bw=sigma)  # kernel='gau' by default
grid = np.linspace(-5 * sigma, 5 * sigma, 2**10)
plt.plot(grid, kernel.evaluate(grid), label='KDE')

# Plot a scipy.stats Gaussian for comparison:
norm = stats.norm(loc=0, scale=sigma)
plt.plot(grid, norm.pdf(grid), label='Normal')
plt.legend()
plt.gcf().suptitle('Gaussian KDE profile of a single sample')
print("Max diff between KDE fit and Gaussian PDF:",
      max(kernel.evaluate(grid) - norm.pdf(grid)))
#%timeit kernel.evaluate(grid)
#%timeit norm.pdf(grid)

# Generate regular samples of input PDF, weighted by PDF value:
x_dist = stats.uniform(loc=2, scale=3)
w_kde_delta = sum_rv_delta_size * 2
sample_locs = np.arange(1, 6, w_kde_delta)
print("Weighted KDE sample spacing:", w_kde_delta,
      ", resulting in", len(sample_locs), "samples.")
# Importance sampling: weight each regular sample location by the input PDF value there.
sample_weights = x_dist.pdf(sample_locs)
weighted_kde = sm.nonparametric.KDEUnivariate(sample_locs)
# fft=False is required by statsmodels when passing weights.
weighted_kde.fit(weights=sample_weights, bw=errscale, fft=False, kernel='gau')
plt.plot(grid, simple.pdf(grid), label='uniform')
plt.plot(grid, err.pdf(grid), label='err')
plt.plot(weighted_kde.support, weighted_kde.density, label='kde')
# plt.plot(grid,weighted_kde.evaluate(grid), label='kde')
plt.xlim(-2, 8)
plt.legend(loc='best'), plt.suptitle('Importance sampling and statsmodels weighted-KDE')
# weighted_kde.support

# Plain Monte-Carlo alternative: draw random samples and use the (faster) FFT-based KDE.
x_sample = x_dist.rvs(size=len(sample_locs)*5)
fft_kde = sm.nonparametric.KDEUnivariate(x_sample)
fft_kde.fit(bw=errscale, fft=True)
plt.plot(grid, simple.pdf(grid), label='uniform')
plt.plot(grid, err.pdf(grid), label='err')
plt.plot(grid, fft_kde.evaluate(grid), label='kde')
plt.xlim(-2, 8)
plt.legend(loc='best'), plt.suptitle('Basic MC-sampling and statsmodels FFT-KDE')

# --- Benchmark cells ---
# NOTE(review): original cell boundaries were lost in the collapsed export;
# the %%timeit groupings below are reconstructed — confirm against the notebook.

# Fit-only timings:
%%timeit
SumRv(simple, err, delta=sum_rv_delta_size)
%%timeit
weighted_kde = sm.nonparametric.KDEUnivariate(sample_locs)
weighted_kde.fit(weights=sample_weights, bw=errscale, fft=False)
%%timeit
fft_kde = sm.nonparametric.KDEUnivariate(x_sample)
fft_kde.fit(bw=errscale, fft=True)

# Evaluate-only timings (using the objects fitted above):
%timeit sum_rv.pdf(grid)
%timeit weighted_kde.evaluate(grid)
%timeit fft_kde.evaluate(grid)

# Combined fit + evaluate timings:
%%timeit
SumRv(simple, err, delta=sum_rv_delta_size)
sum_rv.pdf(grid)
%%timeit
weighted_kde = sm.nonparametric.KDEUnivariate(sample_locs)
weighted_kde.fit(weights=sample_weights, bw=errscale, fft=False)
weighted_kde.evaluate(grid)
%%timeit
fft_kde = sm.nonparametric.KDEUnivariate(x_sample)
fft_kde.fit(bw=errscale, fft=True)
fft_kde.evaluate(grid)