import pandas as pd import pandas.rpy.common as com galton = com.load_data('galton', package='UsingR') from statsmodels.formula.api import ols lm1 = ols('child ~ parent', galton).fit() plot(galton['parent'], galton['child'], 'ob') plot(galton['parent'], lm1.fittedvalues, 'r', linewidth=3); lm1.params # emulates abline function, only possible to basic line styles and width def abline(intercept, gradient, *args, **kwargs): a = gca() xlim = a.get_xlim() ylim = a.get_ylim() if args: sty = args[0] else: sty = 'r' if kwargs: lw = kwargs['linewidth'] else: lw = 5 a.plot(xlim, [intercept + gradient * x for x in xlim], sty, linewidth=lw) a.set_xlim(xlim) a.set_ylim(ylim); parent = np.random.normal(loc=galton['parent'].mean(), scale=galton['parent'].std(), size=1e6) child = lm1.params[0] + lm1.params[1] * parent + np.random.normal(scale=lm1.resid.std(), size=1e6) newGalton = pd.DataFrame({'parent' : parent, 'child' : child}) hexbin(newGalton['parent'], newGalton['child'], cmap='PuBu') abline(lm1.params['Intercept'], lm1.params['parent'], 'r', linewidth=3); import random random.seed(134325) idx = random.sample(np.arange(1e6), 50) sampleGalton1 = newGalton.ix[idx,:] sampleLm1 = ols('child ~ parent', sampleGalton1).fit() plot(sampleGalton1['parent'], sampleGalton1['child'], 'ob') plot(sampleGalton1['parent'], sampleLm1.fittedvalues, 'k-.', linewidth=3) abline(lm1.params['Intercept'], lm1.params['parent'], 'r', linewidth=3); idx = random.sample(np.arange(1e6), 50) sampleGalton2 = newGalton.ix[idx,:] sampleLm2 = ols('child ~ parent', sampleGalton2).fit() plot(sampleGalton2['parent'], sampleGalton2['child'], 'ob') plot(sampleGalton2['parent'], sampleLm2.fittedvalues, 'k-.', linewidth=1) abline(lm1.params['Intercept'], lm1.params['parent'], 'r', linewidth=3); sampleLm = [] for i in range(100): idx = random.sample(np.arange(1e6), 50) sampleGalton = newGalton.ix[idx,:] sampleLm.append(ols('child ~ parent', sampleGalton).fit()) hexbin(newGalton['parent'], newGalton['child'], cmap='PuBu') for lm in sampleLm: abline(lm.params['Intercept'], lm.params['parent'], 'k', linewidth=1) abline(lm1.params['Intercept'], lm1.params['parent'], 'r', linewidth=3); f, (ax1, ax2) = subplots(ncols=2) ax1.hist(map(lambda x: x.params['Intercept'], sampleLm)) ax1.set_xlabel('Intercept') ax2.hist(map(lambda x: x.params['parent'], sampleLm)) ax2.set_xlabel('Slope') f.tight_layout(); idx = random.sample(np.arange(1e6), 50) sampleGalton4 = newGalton.ix[idx,:] sampleLm4 = ols('child ~ parent', sampleGalton4).fit() sampleLm4.summary() # estimated vs real distribution --- very close import scipy.stats as sst hist(map(lambda x: x.params['parent'], sampleLm), normed=True, bins=10) xlabel('Slope') x = np.linspace(0.2, 1.4, num=100) y = sst.norm.pdf(x, loc=sampleLm4.params['parent'], scale=sampleLm4.bse[1]) plot(x, y, 'r'); x = np.linspace(-5, 5, num=100) plot(x, sst.norm.pdf(x), 'k', linewidth=3) plot(x, sst.t.pdf(x, 3), 'r', linewidth=3) plot(x, sst.t.pdf(x, 10), 'b', linewidth=3); sampleLm4.params sampleLm4.conf_int() # null distribution x = np.linspace(-20, 20, num=100) plot(x, sst.t.pdf(x, (928 - 2)), linewidth=3) arr = Arrow(lm1.tvalues[1], .25, 0, -0.25, lw=3, fc='r', ec='r') ax = gca() ax.add_patch(arr); lm1.summary() lm1.pvalues