In [1]:

%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import math
from __future__ import division

# Turn off HTML rendering of data tables so pandas objects are shown as
# plain text in the notebook output.
pd.set_option('display.notebook_repr_html', False)

# Colour cycle used by every seaborn/matplotlib plot in this notebook.
sns.set_palette(['#00A99D', '#F5CA0C', '#B6129F', '#76620C', '#095C57'])

# Report the library versions this notebook was run with.
# `__version__` is the conventional attribute exposed on the top-level
# package; the `pd.version` submodule used previously is not available in
# newer pandas releases, so reading `__version__` keeps this cell runnable.
# `sm` is `statsmodels.api`, so import the top-level package to reach its
# `__version__`.
import statsmodels
(np.__version__, scipy.__version__,
 pd.__version__, statsmodels.__version__)

Out[1]:

('1.9.1', '0.14.0', '0.15.2', '0.6.1')

Non-normal distribution¶

In [2]:

# Hand-picked sample of 50 small integers: most values sit between 1 and 3,
# with a few large ones (6, 8), so the distribution is clearly not normal.
raw_values = [
    1, 1, 0, 1, 2, 2, 3, 2, 1, 3,
    3, 2, 8, 2, 1, 6, 2, 1, 1, 2,
    2, 1, 3, 2, 1, 2, 3, 6, 2, 3,
    2, 2, 1, 2, 2, 3, 2, 1, 2, 3,
    2, 3, 2, 3, 1, 6, 1, 8, 1, 1,
]
data = pd.Series(raw_values)

# Summary statistics (count, mean, std, quartiles) as the cell output.
data.describe()

Out[2]:

count    50.000000
mean      2.340000
std       1.709637
min       0.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       8.000000
dtype: float64

In [3]:

# Plot the non-normal distribution.
# Draw on an explicit Axes and label it so the figure can be understood on
# its own when the notebook is skimmed.
fig, ax = plt.subplots()
sns.distplot(data, ax=ax)
ax.set_title('Non-normal sample')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
plt.show()

In [4]:

# Shapiro-Wilk normality test.
# H0 (null hypothesis): the sample was drawn from a normal distribution.
# The call yields the W statistic and its p-value, i.e. the probability of
# observing a W at least this extreme if H0 were true.
shapiro_result = stats.shapiro(data)
w, p = shapiro_result
w, p

Out[4]:

(0.7413722276687622, 4.960041621870914e-08)

In [5]:

# Compare the p-value with a 5% significance level.
#   True  -> p is above alpha: fail to reject the null hypothesis
#   False -> p is at or below alpha: reject the null hypothesis
alpha = 0.05
p > alpha

Out[5]:

False

Since this p-value is much smaller than the significance level, we reject the null hypothesis that this sample comes from a normal distribution.

Normal distribution¶

In [6]:

# Let's perform the same test on a randomly generated normal sample.
# Seed NumPy's global generator first so that Restart & Run All draws the
# same sample every time; without a seed the data, and therefore the test
# result further down, changes on every run.
# NOTE: outputs recorded before the seed was added will differ until the
# notebook is re-run.
SEED = 42
np.random.seed(SEED)

# mean, standard deviation and sample size of the draw
mu, sigma, n = 0, .1, 50
data = np.random.normal(mu, sigma, n)
data

Out[6]:

array([-0.06305276, -0.12291651, -0.04816977,  0.05572359,  0.13725905,
        0.1447485 , -0.20264179, -0.06685487, -0.0781263 , -0.00718376,
       -0.08506072, -0.03998016, -0.11571598,  0.12958116,  0.10986715,
        0.0649146 , -0.13533652, -0.1174784 , -0.0714837 , -0.03965943,
        0.01309047,  0.04284077,  0.02698651,  0.04941069, -0.01366992,
       -0.16790697,  0.02145848, -0.09678135, -0.07191305,  0.07613435,
        0.01198686, -0.14470548,  0.15809624, -0.09929681,  0.03842638,
        0.06481776, -0.13934484,  0.07156626, -0.10190243, -0.09960222,
        0.21799547, -0.02453129, -0.03748973, -0.17076149,  0.07634182,
       -0.02514305,  0.10922111, -0.15272102, -0.13171909, -0.04938742])

In [7]:

# Plot the more 'normal' distribution.
# Draw on an explicit Axes and label it so the figure can be understood on
# its own when the notebook is skimmed.
fig, ax = plt.subplots()
sns.distplot(data, ax=ax)
ax.set_title('Randomly generated normal sample')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
plt.show()

In [8]:

# Shapiro-Wilk normality test on the generated sample.
# H0 (null hypothesis): the data was drawn from a normal distribution.
# Unpack the W statistic and the p-value, then show both as the cell output.
shapiro_result = stats.shapiro(data)
w, p = shapiro_result
w, p

Out[8]:

(0.9749879240989685, 0.3645625710487366)

In [9]:

# Compare the p-value with a 5% significance level.
#   True  -> p is above alpha: fail to reject the null hypothesis
#   False -> p is at or below alpha: reject the null hypothesis
alpha = 0.05
p > alpha

Out[9]:

True

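Since this p-value is larger than the significance level, we fail to reject the null hypothesis that this sample comes from a normal distribution. Note that this does not prove the data is normal; it only means the test found no significant evidence against normality.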