Importing Packages¶

In [45]:

import pandas as pd
import statsmodels.formula.api as sm
import numpy as np
import ggplot as gg

Reading the Dataset¶

In [46]:

# Load Anscombe's quartet from the Rdatasets CSV mirror into a DataFrame.
ANSCOMBE_CSV_URL = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/anscombe.csv"
anscombe = pd.read_csv(ANSCOMBE_CSV_URL)

In [47]:

# Show the frame exactly as read from the CSV, including the extra 'Unnamed: 0' column.
anscombe

Out[47]:

	Unnamed: 0	x1	x2	x3	x4	y1	y2	y3	y4
0	1	10	10	10	8	8.04	9.14	7.46	6.58
1	2	8	8	8	8	6.95	8.14	6.77	5.76
2	3	13	13	13	8	7.58	8.74	12.74	7.71
3	4	9	9	9	8	8.81	8.77	7.11	8.84
4	5	11	11	11	8	8.33	9.26	7.81	8.47
5	6	14	14	14	8	9.96	8.10	8.84	7.04
6	7	6	6	6	8	7.24	6.13	6.08	5.25
7	8	4	4	4	19	4.26	3.10	5.39	12.50
8	9	12	12	12	8	10.84	9.13	8.15	5.56
9	10	7	7	7	8	4.82	7.26	6.42	7.91
10	11	5	5	5	8	5.68	4.74	5.73	6.89

Dropping the redundant 'Unnamed: 0' column¶

In [48]:

# Remove the row-number column that came along with the CSV.
# `axis` is passed by keyword: passing it positionally (`drop(label, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
# errors='ignore' keeps the cell idempotent, so re-running it after the column
# is already gone does not raise KeyError.
anscombe = anscombe.drop('Unnamed: 0', axis=1, errors='ignore')

The Anscombe Quartet¶

In [49]:

# Show the cleaned frame: four x columns and four y columns.
anscombe

Out[49]:

	x1	x2	x3	x4	y1	y2	y3	y4
0	10	10	10	8	8.04	9.14	7.46	6.58
1	8	8	8	8	6.95	8.14	6.77	5.76
2	13	13	13	8	7.58	8.74	12.74	7.71
3	9	9	9	8	8.81	8.77	7.11	8.84
4	11	11	11	8	8.33	9.26	7.81	8.47
5	14	14	14	8	9.96	8.10	8.84	7.04
6	6	6	6	8	7.24	6.13	6.08	5.25
7	4	4	4	19	4.26	3.10	5.39	12.50
8	12	12	12	8	10.84	9.13	8.15	5.56
9	7	7	7	8	4.82	7.26	6.42	7.91
10	5	5	5	8	5.68	4.74	5.73	6.89

Taking means and standard deviations¶

In [50]:

# Per-column means.
# Use the DataFrame method rather than np.mean(anscombe): NumPy forwards
# axis=None to pandas, and from pandas 2.0 on DataFrame.mean(axis=None)
# reduces over both axes and returns a single scalar instead of one mean
# per column.
anscombe.mean()

Out[50]:

x1    9.000000
x2    9.000000
x3    9.000000
x4    9.000000
y1    7.500909
y2    7.500909
y3    7.500000
y4    7.500909
dtype: float64

In [51]:

# Per-column population standard deviations.
# ddof=0 reproduces what np.std(anscombe) computed (NumPy's default), while
# calling the DataFrame method avoids the axis=None dispatch that newer
# pandas releases deprecate in favour of a whole-frame scalar reduction.
anscombe.std(ddof=0)

Out[51]:

x1    3.162278
x2    3.162278
x3    3.162278
x4    3.162278
y1    1.937024
y2    1.937109
y3    1.935933
y4    1.936081
dtype: float64

Fitting Regression Line Between Respective X and Y¶

In [52]:

# Fit an ordinary least squares regression of y1 on x1 (the formula adds an
# intercept term) and render the full summary table for the fit.
result1 = sm.ols(formula="y1 ~ x1 ", data=anscombe).fit()
result1.summary()
 

/home/ajay/anaconda3/lib/python3.4/site-packages/scipy/stats/stats.py:1285: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
  "anyway, n=%i" % int(n))

Out[52]:

OLS Regression Results
Dep. Variable:	y1	R-squared:	0.667
Model:	OLS	Adj. R-squared:	0.629
Method:	Least Squares	F-statistic:	17.99
Date:	Thu, 07 Jul 2016	Prob (F-statistic):	0.00217
Time:	04:32:15	Log-Likelihood:	-16.841
No. Observations:	11	AIC:	37.68
Df Residuals:	9	BIC:	38.48
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	3.0001	1.125	2.667	0.026	0.456 5.544
x1	0.5001	0.118	4.241	0.002	0.233 0.767

Omnibus:	0.082	Durbin-Watson:	3.212
Prob(Omnibus):	0.960	Jarque-Bera (JB):	0.289
Skew:	-0.122	Prob(JB):	0.865
Kurtosis:	2.244	Cond. No.	29.1

In [53]:

# Exploratory: list the attributes and methods exposed by the fitted results object.
dir(result1)

Out[53]:

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache',
 '_data_attr',
 '_get_robustcov_results',
 '_is_nested',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'diagn',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_robustcov_results',
 'initialize',
 'k_constant',
 'llf',
 'load',
 'model',
 'mse_model',
 'mse_resid',
 'mse_total',
 'nobs',
 'normalized_cov_params',
 'outlier_test',
 'params',
 'predict',
 'pvalues',
 'remove_data',
 'resid',
 'resid_pearson',
 'rsquared',
 'rsquared_adj',
 'save',
 'scale',
 'ssr',
 'summary',
 'summary2',
 't_test',
 'tvalues',
 'uncentered_tss',
 'use_t',
 'wald_test',
 'wresid']

In [54]:

# Fitted coefficients of the first regression: intercept and x1 slope.
result1.params

Out[54]:

Intercept    3.000091
x1           0.500091
dtype: float64

In [55]:

# R-squared of the first regression.
result1.rsquared

Out[55]:

0.66654245950877489

In [56]:

# Fit the same simple regression for the remaining three x/y pairs.
result2, result3, result4 = (
    sm.ols(formula=pair_formula, data=anscombe).fit()
    for pair_formula in ("y2 ~ x2 ", "y3 ~ x3 ", "y4 ~ x4 ")
)

In [57]:

# Print intercept and slope of each of the four fits, in order.
for fitted in (result1, result2, result3, result4):
    print(fitted.params)

Intercept    3.000091
x1           0.500091
dtype: float64
Intercept    3.000909
x2           0.500000
dtype: float64
Intercept    3.002455
x3           0.499727
dtype: float64
Intercept    3.001727
x4           0.499909
dtype: float64

In [58]:

# Print the R-squared of each of the four fits, in order.
for fitted in (result1, result2, result3, result4):
    print(fitted.rsquared)

0.666542459509
0.666242033727
0.666324041067
0.666707256898

In [59]:

# Per-column means, printed as plain text.
# DataFrame.mean() is used instead of np.mean(anscombe): NumPy forwards
# axis=None, which pandas >= 2.0 interprets as a reduction over both axes,
# yielding one scalar rather than one mean per column.
print(anscombe.mean())

x1    9.000000
x2    9.000000
x3    9.000000
x4    9.000000
y1    7.500909
y2    7.500909
y3    7.500000
y4    7.500909
dtype: float64

In [60]:

# Per-column population standard deviations, printed as plain text.
# ddof=0 matches NumPy's default used by the former np.std(anscombe) call;
# the DataFrame method avoids the axis=None dispatch that newer pandas
# releases deprecate in favour of a whole-frame scalar reduction.
print(anscombe.std(ddof=0))

x1    3.162278
x2    3.162278
x3    3.162278
x4    3.162278
y1    1.937024
y2    1.937109
y3    1.935933
y4    1.936081
dtype: float64

Conclusion¶

Each of the four x/y pairs has the same means, the same standard deviations, the same regression parameters, and the same R-squared value (up to two decimal places). Judged by summary statistics alone, the four data sets of the quartet (X1 Y1, X2 Y2, X3 Y3, X4 Y4) look the same.

Data Visualization¶

In [61]:

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [62]:

# Scatter plot of y1 against x1; the last expression displays the plot.
p = gg.ggplot(gg.aes(x='x1', y='y1'), data=anscombe)
p + gg.geom_point()

/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Out[62]:

<ggplot: (-901764730)>

In [63]:

# Scatter plot of y2 against x2; the last expression displays the plot.
p2 = gg.ggplot(gg.aes(x='x2', y='y2'), data=anscombe)
p2 + gg.geom_point()

/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Out[63]:

<ggplot: (-901793152)>

In [64]:

# Scatter plot of y3 against x3; the last expression displays the plot.
p3 = gg.ggplot(gg.aes(x='x3', y='y3'), data=anscombe)
p3 + gg.geom_point()

/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Out[64]:

<ggplot: (-901915866)>

In [65]:

# Scatter plot of y4 against x4; the last expression displays the plot.
p4= gg.ggplot(gg.aes(x='x4', y='y4'), data=anscombe)
p4 + gg.geom_point()

/home/ajay/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Out[65]:

<ggplot: (-901651556)>

Conclusion¶

The graphs show that the four data sets in the quartet are completely different, even though the summary statistics (means, standard deviations, regression) showed identical results.