%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import math
from __future__ import division
# Turn off HTML rendering of data tables so pandas objects display as plain text
pd.set_option('display.notebook_repr_html', False)
# Set the default seaborn color palette (five custom hex colors)
sns.set_palette(['#00A99D', '#F5CA0C', '#B6129F', '#76620C', '#095C57'])
# Echo the versions of numpy, scipy, pandas and statsmodels used for this run.
# NOTE(review): the `.version` submodules on pandas / statsmodels are presumably
# not available in newer releases (`__version__` is the usual spelling) --
# confirm before running on an upgraded environment.
np.version.full_version, scipy.version.full_version, \
pd.version.version, sm.version.full_version
('1.9.1', '0.14.0', '0.15.2', '0.6.1')
# Build a sample of 50 small integer observations whose distribution is
# obviously not normal
data = pd.Series([
    1, 1, 0, 1, 2, 2, 3, 2, 1, 3, 3, 2, 8, 2, 1, 6, 2, 1, 1, 2, 2, 1, 3, 2, 1,
    2, 3, 6, 2, 3, 2, 2, 1, 2, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 1, 6, 1, 8, 1, 1,
])
# Display summary statistics for the sample
data.describe()
count 50.000000 mean 2.340000 std 1.709637 min 0.000000 25% 1.000000 50% 2.000000 75% 3.000000 max 8.000000 dtype: float64
# Plot the distribution of the non-normal sample
sns.distplot(data)
# Render the figure
plt.show()
# Shapiro-Wilk test: the null hypothesis states that the data was drawn
# from a normal distribution. The call returns the W statistic and the
# p-value, i.e. the probability of observing a W statistic at least this
# extreme if the null hypothesis were true (it is NOT the probability
# that the data is normal).
w, p = stats.shapiro(data)
# Echo the statistic and the p-value as the cell output
w, p
(0.7413722276687622, 4.960041621870914e-08)
# Compare the p-value with a 5% significance level (alpha):
#   True  -> fail to reject the null hypothesis of normality
#   False -> reject the null hypothesis of normality
alpha = .05
p > alpha
False
Since this p-value (about 5e-08) is much smaller than the significance level, we reject the null hypothesis that this sample comes from a normal distribution.
# Now perform the same test on a randomly generated normal sample:
# n draws from a normal distribution with mean mu and standard deviation sigma
mu = 0
sigma = .1
n = 50
data = np.random.normal(loc=mu, scale=sigma, size=n)
# Echo the generated values as the cell output
data
array([-0.06305276, -0.12291651, -0.04816977, 0.05572359, 0.13725905, 0.1447485 , -0.20264179, -0.06685487, -0.0781263 , -0.00718376, -0.08506072, -0.03998016, -0.11571598, 0.12958116, 0.10986715, 0.0649146 , -0.13533652, -0.1174784 , -0.0714837 , -0.03965943, 0.01309047, 0.04284077, 0.02698651, 0.04941069, -0.01366992, -0.16790697, 0.02145848, -0.09678135, -0.07191305, 0.07613435, 0.01198686, -0.14470548, 0.15809624, -0.09929681, 0.03842638, 0.06481776, -0.13934484, 0.07156626, -0.10190243, -0.09960222, 0.21799547, -0.02453129, -0.03748973, -0.17076149, 0.07634182, -0.02514305, 0.10922111, -0.15272102, -0.13171909, -0.04938742])
# Plot the distribution of the randomly generated (more 'normal') sample
sns.distplot(data)
# Render the figure
plt.show()
# The Shapiro-Wilk test tests the null hypothesis that the data
# was drawn from a normal distribution. The call returns the W statistic
# and the p-value of the test.
w, p = stats.shapiro(data)
# Echo the statistic and the p-value as the cell output
w, p
(0.9749879240989685, 0.3645625710487366)
# Compare the p-value with a 5% significance level (alpha):
#   True  -> fail to reject the null hypothesis of normality
#   False -> reject the null hypothesis of normality
alpha = .05
p > alpha
True
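Since this p-value (about 0.36) is much larger than the significance level, we fail to reject (retain) the null hypothesis that this sample comes from a normal distribution. Note that this does not prove the sample is normal; it only means the test found no significant evidence against normality.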