import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats
# UK regional spending on alcohol and tobacco (a classic small data set).
data_str = '''Region Alcohol Tobacco
North 6.47 4.03
Yorkshire 6.13 3.76
Northeast 6.19 3.77
East_Midlands 4.89 3.34
West_Midlands 5.63 3.47
East_Anglia 4.52 2.92
Southeast 5.89 3.20
Southwest 4.79 2.71
Wales 5.27 3.53
Scotland 6.08 4.51
Northern_Ireland 4.02 4.56'''
# (import kept from the original tutorial; note for Python 2.x users)
from io import StringIO
# Split the whitespace-delimited table by hand: the first line is the
# header, each remaining line is one region's record.
_rows = [line.split() for line in data_str.splitlines()]
df = pd.DataFrame(_rows[1:], columns=_rows[0])
# read_csv would infer numeric dtypes; do the conversion explicitly here
df[['Alcohol', 'Tobacco']] = df[['Alcohol', 'Tobacco']].astype(float)
# Scatter plot of alcohol vs. tobacco sales, one point per region.
ax = df.plot(x='Tobacco', y='Alcohol', style='o')
ax.set_ylabel('Alcohol')
ax.set_title('Sales in Several UK Regions')
#plt.show()
<matplotlib.text.Text at 0x846ccd0>
# Fit the OLS model Alcohol ~ Tobacco, dropping the last row
# (Northern_Ireland, an obvious outlier in the scatter plot).
model = sm.ols('Alcohol ~ Tobacco', df[:-1])
result = model.fit()
print(result.summary())
OLS Regression Results ============================================================================== Dep. Variable: Alcohol R-squared: 0.615 Model: OLS Adj. R-squared: 0.567 Method: Least Squares F-statistic: 12.78 Date: Sun, 01 Nov 2015 Prob (F-statistic): 0.00723 Time: 18:30:14 Log-Likelihood: -4.9998 No. Observations: 10 AIC: 14.00 Df Residuals: 8 BIC: 14.60 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ Intercept 2.0412 1.001 2.038 0.076 -0.268 4.350 Tobacco 1.0059 0.281 3.576 0.007 0.357 1.655 ============================================================================== Omnibus: 2.542 Durbin-Watson: 1.975 Prob(Omnibus): 0.281 Jarque-Bera (JB): 0.904 Skew: -0.014 Prob(JB): 0.636 Kurtosis: 1.527 Cond. No. 27.2 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\WinPython-32bit-3.4.3.6\python-3.4.3\lib\site-packages\scipy\stats\stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10 "anyway, n=%i" % int(n))
# Reconstruct the regression F-test by hand from the model and
# residual mean squares, to verify the summary-table values.
n_obs = result.nobs
n_params = result.df_model + 1               # slope(s) plus the intercept
df_model, df_error = n_params - 1, n_obs - n_params
F = result.mse_model / result.mse_resid
p = 1.0 - stats.f.cdf(F, df_model, df_error)
print('F-statistic: {:.3f}, p-value: {:.5f}'.format( F, p ))
F-statistic: 12.785, p-value: 0.00723
# Log-likelihood of the fitted model under the normal-errors assumption,
# computed from the residual sum of squares; matches the summary table.
n_obs = result.nobs
rss = result.ssr
sigma2 = rss / n_obs                 # ML estimate of the error variance
L = ( 1.0/np.sqrt(2*np.pi*sigma2) ) ** n_obs * np.exp( -rss/(sigma2*2.0) )
print('ln(L) =', np.log( L ))
ln(L) = -4.99975869739
# The fitted coefficients: intercept and Tobacco slope.
print(result.params)
Intercept 2.041223 Tobacco 1.005896 dtype: float64
# Standard Errors of the OLS coefficients, computed "by hand" as the
# square roots of the diagonal of  MSE_resid * (X X')^-1.
# (The original version also built an unused 'Eins' column on df and a
# dead DataFrame-based X that was immediately overwritten -- removed.)
Y = df.Alcohol[:-1]          # response, excluding the outlier last row
X = df.Tobacco[:-1]
# add a row of ones for the constant intercept term (intercept first,
# matching the order of result.params)
X = np.vstack(( np.ones(X.size), X ))
# convert the NumPy array to a matrix
X = np.matrix( X )
# perform the matrix multiplication, and then take the inverse
# (X is 2 x n here, so X * X.T is the 2x2 normal matrix)
C = np.linalg.inv( X * X.T )
# multiply by the MSE of the residual to get the coefficient covariance
C *= result.mse_resid
# element-wise square root: the diagonal holds the standard errors;
# the off-diagonal covariances are negative, so sqrt yields nan there
# (harmless -- only the diagonal SE[i,i] is used below)
SE = np.sqrt(C)
print(SE)
[[ 1.00136021 nan] [ nan 0.28132158]]
# t-statistic for the Tobacco slope: coefficient divided by its
# standard error (row/column 1 of the SE matrix).
i = 1
t = result.params[i] / SE[i, i]
print('t =', t)
t = 3.57560845424
# Two-sided p-value for the t-statistic above, from Student's t with
# N - k degrees of freedom (k = number of slopes plus the intercept).
dof = result.nobs - (result.df_model + 1)
p = 2.0 * (1.0 - stats.t(dof).cdf(t))
print('p = {0:.3f}'.format(p))
p = 0.007
# 95% confidence interval for the intercept (i = 0), computed by hand.
i = 0
# the estimated coefficient, and its standard error
# (SE[i,i] is already the square root of the variance, not the variance)
beta, c = result.params[i], SE[i,i]
# critical value of the t-statistic
N = result.nobs
P = result.df_model
dof = N - P - 1
# two-sided 97.5th percentile of Student's t -> 95% coverage
z = stats.t( dof ).ppf(0.975)
# the confidence interval: estimate +/- critical value * standard error
print(beta - z * c, beta + z * c)
-0.267917709371 4.35036388305
# Biased (moment-based) skewness and Pearson kurtosis of the residuals,
# matching the values reported in the regression summary table.
resid = Y - result.fittedvalues
m2 = np.mean( resid**2.0 )                # second central moment
S = np.mean( resid**3.0 ) / m2**1.5       # skewness: m3 / m2^(3/2)
K = np.mean( resid**4.0 ) / m2**2.0       # kurtosis: m4 / m2^2
print('Skewness: {:.3f}, Kurtosis: {:.3f}'.format( S, K ))
Skewness: -0.014, Kurtosis: 1.527
def Z1( s, n ):
    """Transform the sample skewness *s* of *n* observations into an
    approximately standard-normal z-score (D'Agostino's skewness test;
    the same transform scipy.stats.skewtest applies)."""
    # standardized skewness
    y = s * np.sqrt( ( ( n + 1 )*( n + 3 ) ) / ( 6.0 * ( n - 2.0 ) ) )
    # sixth standardized moment of the skewness sampling distribution
    beta2 = 3.0 * ( n**2.0 + 27.0*n - 70 )*( n + 1.0 )*( n + 3.0 ) \
            / ( ( n - 2.0 )*( n + 5.0 )*( n + 7.0 )*( n + 9.0 ) )
    w2 = np.sqrt( 2.0 * ( beta2 - 1.0 ) ) - 1.0
    alpha = np.sqrt( 2.0 / ( w2 - 1.0 ) )
    # Johnson SU transform to approximate normality
    delta = 1.0 / np.sqrt( np.log( np.sqrt( w2 ) ) )
    return delta * np.log( y / alpha + np.sqrt( ( y / alpha )**2.0 + 1.0 ) )
def Z2( k, n ):
    """Transform the sample (Pearson) kurtosis *k* of *n* observations
    into an approximately standard-normal z-score (Anscombe-Glynn
    kurtosis test, as in scipy.stats.kurtosistest).

    Fix: when the Wilson-Hilferty denominator 1 + X*sqrt(2/(A-4)) is
    negative (extreme low kurtosis / larger n), the original code took a
    fractional power of a negative float, yielding a complex/nan result.
    A sign-aware cube root (matching scipy) handles that case.
    """
    # mean and variance of the kurtosis sampling distribution under normality
    E = 3.0 * ( n - 1.0 ) / ( n + 1.0 )
    v = 24.0 * n * ( n - 2.0 )*( n - 3.0 )
    v /= ( n + 1.0 )**2.0*( n + 3.0 )*( n + 5.0 )
    # standardized kurtosis
    X = ( k - E ) / np.sqrt( v )
    # sqrt(beta1) of the kurtosis sampling distribution -> shape parameter A
    b = ( 6.0 * ( n**2.0 - 5.0*n + 2.0 ) ) / ( ( n + 7.0 )*( n + 9.0 ) )
    b *= np.sqrt( ( 6.0 * ( n + 3.0 )*( n + 5.0 ) ) / ( n * ( n - 2.0 )*( n - 3.0 ) ) )
    A = 6.0 + ( 8.0 / b )*( 2.0 / b + np.sqrt( 1.0 + 4.0 / b**2.0 ) )
    # Wilson-Hilferty cube-root transform to normality, with a
    # sign-aware cube root so a negative denominator stays real
    denom = 1.0 + X * np.sqrt( 2.0 / ( A - 4.0 ) )
    term = np.sign( denom ) * ( ( 1.0 - 2.0 / A ) / np.abs( denom ) )**(1.0/3.0)
    z = ( ( 1.0 - 2.0 / ( 9.0 * A ) ) - term ) / np.sqrt( 2.0 / ( 9.0 * A ) )
    return z
# Omnibus (D'Agostino-Pearson) normality test on the residuals:
# K^2 = Z1(skew)^2 + Z2(kurtosis)^2 ~ chi-squared(2) under normality.
K2 = Z1( S, N )**2.0 + Z2( K, N )**2.0
print('Omnibus: {}'.format( K2))
# p-value from the chi-squared(2) tail
p = 1.0 - stats.chi2(2).cdf( K2 )
print('Pr( Omnibus ) = {}'.format( p ))
# Cross-check: scipy's normaltest implements the same statistic.
(K2, p) = stats.normaltest(result.resid)
print('Omnibus: {0}, p = {1}'.format(K2, p))
Omnibus: 2.5418981690649516 Pr( Omnibus ) = 0.2805652152710648 Omnibus: 2.5418981690649542, p = 0.28056521527106454
C:\WinPython-32bit-3.4.3.6\python-3.4.3\lib\site-packages\scipy\stats\stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=10 "anyway, n=%i" % int(n))
# Durbin-Watson statistic for serial correlation of the residuals:
# sum of squared successive differences over the residual sum of
# squares (values near 2 indicate no autocorrelation).
resid_vals = result.resid.values
DW = np.sum( np.diff( resid_vals )**2.0 ) / result.ssr
print('Durbin-Watson: {:.5f}'.format( DW ))
Durbin-Watson: 1.97535
# Jarque-Bera normality test from the sample skewness S and Pearson
# kurtosis K; asymptotically chi-squared with 2 degrees of freedom.
JB = (N / 6.0) * (S**2.0 + 0.25 * (K - 3.0)**2.0)
p = 1.0 - stats.chi2(2).cdf(JB)
print('JB-statistic: {:.5f}, p-value: {:.5f}'.format( JB, p ))
JB-statistic: 0.90421, p-value: 0.63629
# Multicollinearity diagnostic: eigen-decompose the 2x2 normal matrix
# X X' of the design matrix (regressor row first, constant row second).
X = df.Tobacco[:-1]
# stack the intercept column underneath and convert to a matrix in one go
X = np.matrix( np.vstack(( X, np.ones( X.size ) )) )
EV = np.linalg.eig( X * X.T )
print(EV)
(array([ 136.51527115, 0.18412885]), matrix([[ 0.96332746, -0.26832855], [ 0.26832855, 0.96332746]]))
# Condition number: square root of the ratio of the largest to the
# smallest eigenvalue of X X' (matches the summary table's Cond. No.).
eigvals = EV[0]
CN = np.sqrt( eigvals.max() / eigvals.min() )
print('Condition No.: {:.5f}'.format( CN ))
Condition No.: 27.22887