import pandas as pd
import statsmodels.formula.api as sm
import numpy as np
import ggplot as gg
# Load Anscombe's quartet from the Rdatasets CSV mirror and display it.
ANSCOMBE_CSV_URL = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/anscombe.csv"
anscombe = pd.read_csv(ANSCOMBE_CSV_URL)
anscombe
Unnamed: 0 | x1 | x2 | x3 | x4 | y1 | y2 | y3 | y4 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 10 | 10 | 10 | 8 | 8.04 | 9.14 | 7.46 | 6.58 |
1 | 2 | 8 | 8 | 8 | 8 | 6.95 | 8.14 | 6.77 | 5.76 |
2 | 3 | 13 | 13 | 13 | 8 | 7.58 | 8.74 | 12.74 | 7.71 |
3 | 4 | 9 | 9 | 9 | 8 | 8.81 | 8.77 | 7.11 | 8.84 |
4 | 5 | 11 | 11 | 11 | 8 | 8.33 | 9.26 | 7.81 | 8.47 |
5 | 6 | 14 | 14 | 14 | 8 | 9.96 | 8.10 | 8.84 | 7.04 |
6 | 7 | 6 | 6 | 6 | 8 | 7.24 | 6.13 | 6.08 | 5.25 |
7 | 8 | 4 | 4 | 4 | 19 | 4.26 | 3.10 | 5.39 | 12.50 |
8 | 9 | 12 | 12 | 12 | 8 | 10.84 | 9.13 | 8.15 | 5.56 |
9 | 10 | 7 | 7 | 7 | 8 | 4.82 | 7.26 | 6.42 | 7.91 |
10 | 11 | 5 | 5 | 5 | 8 | 5.68 | 4.74 | 5.73 | 6.89 |
# Drop the row-number column that came along with the CSV export.
# `axis` is passed by keyword: the positional form was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0 onwards.
anscombe = anscombe.drop('Unnamed: 0', axis=1)
anscombe
x1 | x2 | x3 | x4 | y1 | y2 | y3 | y4 | |
---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 10 | 8 | 8.04 | 9.14 | 7.46 | 6.58 |
1 | 8 | 8 | 8 | 8 | 6.95 | 8.14 | 6.77 | 5.76 |
2 | 13 | 13 | 13 | 8 | 7.58 | 8.74 | 12.74 | 7.71 |
3 | 9 | 9 | 9 | 8 | 8.81 | 8.77 | 7.11 | 8.84 |
4 | 11 | 11 | 11 | 8 | 8.33 | 9.26 | 7.81 | 8.47 |
5 | 14 | 14 | 14 | 8 | 9.96 | 8.10 | 8.84 | 7.04 |
6 | 6 | 6 | 6 | 8 | 7.24 | 6.13 | 6.08 | 5.25 |
7 | 4 | 4 | 4 | 19 | 4.26 | 3.10 | 5.39 | 12.50 |
8 | 12 | 12 | 12 | 8 | 10.84 | 9.13 | 8.15 | 5.56 |
9 | 7 | 7 | 7 | 8 | 4.82 | 7.26 | 6.42 | 7.91 |
10 | 5 | 5 | 5 | 8 | 5.68 | 4.74 | 5.73 | 6.89 |
# Per-column means. Use the DataFrame method rather than np.mean():
# NumPy forwards axis=None, which pandas >= 2.0 interprets as "reduce over
# both axes" and returns a single scalar instead of one mean per column.
anscombe.mean()
x1 9.000000 x2 9.000000 x3 9.000000 x4 9.000000 y1 7.500909 y2 7.500909 y3 7.500000 y4 7.500909 dtype: float64
# Per-column population standard deviation (ddof=0, matching what np.std()
# computed). Calling the DataFrame method avoids NumPy forwarding axis=None,
# which newer pandas deprecates in favour of a whole-frame scalar reduction.
anscombe.std(ddof=0)
x1 3.162278 x2 3.162278 x3 3.162278 x4 3.162278 y1 1.937024 y2 1.937109 y3 1.935933 y4 1.936081 dtype: float64
# Ordinary least squares regression of y1 on x1; show the full summary table.
model1 = sm.ols(formula="y1 ~ x1 ", data=anscombe)
result1 = model1.fit()
result1.summary()
/home/ajay/anaconda3/lib/python3.4/site-packages/scipy/stats/stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11 "anyway, n=%i" % int(n))
Dep. Variable: | y1 | R-squared: | 0.667 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.629 |
Method: | Least Squares | F-statistic: | 17.99 |
Date: | Thu, 07 Jul 2016 | Prob (F-statistic): | 0.00217 |
Time: | 04:32:15 | Log-Likelihood: | -16.841 |
No. Observations: | 11 | AIC: | 37.68 |
Df Residuals: | 9 | BIC: | 38.48 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 3.0001 | 1.125 | 2.667 | 0.026 | 0.456 5.544 |
x1 | 0.5001 | 0.118 | 4.241 | 0.002 | 0.233 0.767 |
Omnibus: | 0.082 | Durbin-Watson: | 3.212 |
---|---|---|---|
Prob(Omnibus): | 0.960 | Jarque-Bera (JB): | 0.289 |
Skew: | -0.122 | Prob(JB): | 0.865 |
Kurtosis: | 2.244 | Cond. No. | 29.1 |
# List every attribute and method exposed by the fitted result object.
dir(result1)
['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cache', '_data_attr', '_get_robustcov_results', '_is_nested', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_robustcov_results', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_model', 'mse_resid', 'mse_total', 'nobs', 'normalized_cov_params', 'outlier_test', 'params', 'predict', 'pvalues', 'remove_data', 'resid', 'resid_pearson', 'rsquared', 'rsquared_adj', 'save', 'scale', 'ssr', 'summary', 'summary2', 't_test', 'tvalues', 'uncentered_tss', 'use_t', 'wald_test', 'wresid']
# Fitted coefficients (intercept and slope for x1).
result1.params
Intercept 3.000091 x1 0.500091 dtype: float64
# R-squared of the y1 ~ x1 fit.
result1.rsquared
0.66654245950877489
# Fit the same simple regression for the remaining three x/y pairs, then
# print the coefficients of all four fits for comparison.
result2, result3, result4 = [
    sm.ols(formula=spec, data=anscombe).fit()
    for spec in ("y2 ~ x2 ", "y3 ~ x3 ", "y4 ~ x4 ")
]
for fitted in (result1, result2, result3, result4):
    print(fitted.params)
Intercept 3.000091 x1 0.500091 dtype: float64 Intercept 3.000909 x2 0.500000 dtype: float64 Intercept 3.002455 x3 0.499727 dtype: float64 Intercept 3.001727 x4 0.499909 dtype: float64
# Print R-squared for each of the four fits, one per line.
for fitted in (result1, result2, result3, result4):
    print(fitted.rsquared)
0.666542459509 0.666242033727 0.666324041067 0.666707256898
# Per-column means via the DataFrame method: np.mean() forwards axis=None,
# which pandas >= 2.0 reduces to a single scalar over the whole frame.
print(anscombe.mean())
x1 9.000000 x2 9.000000 x3 9.000000 x4 9.000000 y1 7.500909 y2 7.500909 y3 7.500000 y4 7.500909 dtype: float64
# Per-column population standard deviation (ddof=0, as np.std() used).
# The DataFrame method avoids the axis=None dispatch that newer pandas
# deprecates in favour of a whole-frame scalar reduction.
print(anscombe.std(ddof=0))
x1 3.162278 x2 3.162278 x3 3.162278 x4 3.162278 y1 1.937024 y2 1.937109 y3 1.935933 y4 1.936081 dtype: float64
It seems that all four datasets have the same means, the same standard deviations, the same regression parameters, and the same R-squared value (up to two decimal places). So, judging by summary statistics alone, the four datasets of the quartet (x1/y1, x2/y2, x3/y3, x4/y4) look the same.
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Scatter plot of y1 against x1.
p = gg.ggplot(gg.aes(x='x1', y='y1'), data=anscombe)
p + gg.geom_point()
/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
<ggplot: (-901764730)>
# Scatter plot of y2 against x2.
p2 = gg.ggplot(gg.aes(x='x2', y='y2'), data=anscombe)
p2 + gg.geom_point()
/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
<ggplot: (-901793152)>
# Scatter plot of y3 against x3.
p3 = gg.ggplot(gg.aes(x='x3', y='y3'), data=anscombe)
p3 + gg.geom_point()
/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
<ggplot: (-901915866)>
# Scatter plot of y4 against x4.
p4= gg.ggplot(gg.aes(x='x4', y='y4'), data=anscombe)
p4 + gg.geom_point()
/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. warnings.warn(self.msg_depr % (key, alt_key))
<ggplot: (-901651556)>
The graphs show that the four datasets are completely different, even though the summary statistics (means, standard deviations, regression) showed identical results.