Author: Thomas Haslwanter, Feb-2017
%pylab inline
import scipy.stats as stats
# seaborn is a package for the visualization of statistical data
import seaborn as sns
sns.set(style='ticks')
Populating the interactive namespace from numpy and matplotlib
''' Different aspects of a normal distribution'''
# Generate the data
x = r_[-10:10:0.1]
rv = stats.norm(0,1) # random variate
x2 = r_[0:1:0.001]
ax = subplot2grid((3,2),(0,0), colspan=2)
plot(x,rv.pdf(x))
xlim([-10,10])
title('Normal Distribution - PDF')
subplot(323)
plot(x,rv.cdf(x))
xlim([-4,4])
title('CDF: cumulative distribution fct')
subplot(324)
plot(x,rv.sf(x))
xlim([-4,4])
title('SF: survival fct')
subplot(325)
plot(x2,rv.ppf(x2))
title('PPF')
subplot(326)
plot(x2,rv.isf(x2))
title('ISF')
tight_layout()
show()
'''PDF, scatter plot, and histogram.'''
# Generate the data
x = arange(-5,15,0.1)
# Plot a normal distribution: "Probability density functions"
myMean = 5
mySD = 2
y = normpdf(x, myMean, mySD)
# or: y = stats.norm.pdf(x, myMean, mySD)
plot(x,y)
title('Shifted Normal Distribution')
<matplotlib.text.Text at 0x27b4256e198>
numData = 500
data = stats.norm.rvs(myMean, mySD, size = numData)
plot(data, '.')
title('Normally distributed data')
show()
hist(data)
title('Histogram of normally distributed data')
<matplotlib.text.Text at 0x27b4268f6d8>
'''Show multiple samples from the same distribution, and compare means.'''
# Do this 25 times, and show the histograms
numRows = 5
numData = 100
for ii in range(numRows):
for jj in range(numRows):
data = stats.norm.rvs(myMean, mySD, size=numData)
subplot(numRows,numRows,numRows*ii+jj+1)
hist(data)
xticks([])
yticks([])
xlim(myMean-3*mySD, myMean+3*mySD)
tight_layout()
show()
# Check out the mean of 1000 normally distributded samples
numTrials = 1000;
numData = 100
myMeans = ones(numTrials)*nan
for ii in range(numTrials):
data = stats.norm.rvs(myMean, mySD, size=numData)
myMeans[ii] = mean(data)
print('The standard error of the mean, with {0} samples, is {1:5.3f}'.format(numData, std(myMeans)))
The standard error of the mean, with 100 samples, is 0.200
'''Check if the distribution is normal.'''
# Generate and show a distribution
numData = 100
data = stats.norm.rvs(myMean, mySD, size=numData)
hist(data)
(array([ 4., 6., 10., 7., 19., 15., 15., 15., 5., 4.]), array([ 0.76711319, 1.55863157, 2.35014995, 3.14166832, 3.9331867 , 4.72470508, 5.51622346, 6.30774183, 7.09926021, 7.89077859, 8.68229697]), <a list of 10 Patch objects>)
# Graphical test: if the data lie on a line, they are pretty much
# normally distributed
_ = stats.probplot(data, plot=plt)
# The scipy "normaltest" is based on D’Agostino and Pearson’s test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVal = stats.normaltest(data)
# Or you can check for normality with Kolmogorov-Smirnov test: but this is only advisable for large sample numbers!
#_,pVal = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
if pVal > 0.05:
print('Data are probably normally distributed')
Data are probably normally distributed
'''Calculate an empirical cumulative distribution function, compare it with the exact one, and
find the exact point for a specific data value.'''
# Generate normally distributed random data
myMean = 5
mySD = 2
numData = 100
data = stats.norm.rvs(myMean, mySD, size=numData)
# Calculate the cumulative distribution function, CDF
numbins = 20
counts, bin_edges = histogram(data, bins=numbins, normed=True)
cdf = cumsum(counts)
cdf /= max(cdf)
# compare with the exact CDF
step(bin_edges[1:],cdf)
plot(x, stats.norm.cdf(x, myMean, mySD),'r')
# Find out the value corresponding to the x-th percentile: the
# "cumulative distribution function"
value = 2
myMean = 5
mySD = 2
cdf = stats.norm.cdf(value, myMean, mySD)
print(('With a threshold of {0:4.2f}, you get {1}% of the data'.format(value, round(cdf*100))))
# For the percentile corresponding to a certain value:
# the "inverse cumulative distribution function"
value = 0.025
icdf = stats.norm.isf(value, myMean, mySD)
print('To get {0}% of the data, you need a threshold of {1:4.2f}.'.format((1-value)*100, icdf))
With a threshold of 2.00, you get 7.0% of the data To get 97.5% of the data, you need a threshold of 8.92.