import pandas as pd import matplotlib.pyplot as plt import rpy2 from IPython.display import YouTubeVideo %load_ext rpy2.ipython %matplotlib inline heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None) heig = pd.melt(heig, value_vars = ['child', 'parent'], value_name = 'height') %%R -i heig require(ggplot2) df <- as.data.frame(heig) g <- ggplot(heig, aes(height, fill=variable)) + geom_histogram(binwidth=1) + theme_bw() g <- g + facet_grid(~variable) g child = heig[heig['variable'] == 'child'] import numpy as np child.head() hrange = np.arange(child['height'].min(), child['height'].max(), .5) MSE = lambda x: ((child.height - x)**2).mean() MSEdf = pd.DataFrame([(_, MSE(_)) for _ in hrange], columns = ['mu', 'MSE']) pd.DataFrame.plot(MSEdf, x = 'mu', y='MSE', title='Minimizing Mu'); child.mean() # reimport so I don't have to unmelt heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None) # get counts of values for the bubble plot h = pd.DataFrame({'count' : heig.groupby( [ "child", "parent"] ).size()}).reset_index() h[:10] %%R -i h require(ggplot2) require(hexbin) df <- as.data.frame(h) g <- ggplot(df, aes(parent, child, size=count)) + geom_point() + ggtitle("Bubble Plot of Height") g + theme_bw() col = 'child' h[col] = h[col] - h[col].mean() col = 'parent' h[col] = h[col] - h[col].mean() %%R -i h require(ggplot2) require(hexbin) df <- as.data.frame(h) g <- ggplot(df, aes(parent, child, size=count)) + geom_point() + ggtitle("Bubble Plot Height De-meaned") + theme_bw() g2 <- geom_smooth(data=df, aes(parent, child, weight=count), colour='red', method=lm, se=FALSE) g3 <- geom_smooth(data=df, aes(parent, child), colour='green', method=lm, se=FALSE) g4 <- geom_smooth(data=df, aes(parent, child), colour='blue', method=loess, se=FALSE) g + g2 + g3 + g4 + theme(legend.position="none") col = 'child' heig[col] = heig[col] - heig[col].mean() col = 'parent' heig[col] = heig[col] - heig[col].mean() MSE = lambda beta: ((heig['child'] - heig['parent'] * beta)**2).mean() # try slopes between -3 and 3 betas = np.arange(-3, 3, .01) df = pd.DataFrame([(beta, MSE(beta),) for beta in betas], columns = ['beta', 'MSE']) df.plot(x = 'beta', y='MSE', title = 'finding the best Beta empirically'); min_beta = float(df[df['MSE']==df['MSE'].min()]['beta']) df[df['MSE']==df['MSE'].min()] float(min_beta) from scipy.stats import linregress slope, intercept, rvalue, pvalue, stderr = linregress(heig['parent'], heig['child']) slope MSE(slope) import statsmodels.formula.api as sm result = sm.ols(formula="child ~ parent", data=heig).fit() heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None) heig.head() y = 'child' x = 'parent' beta1 = heig[y].corr(heig[x]) * heig[y].std() / heig[x].std() beta0 = heig[y].mean() - beta1 * heig[x].mean() beta0 beta1 result = sm.ols(formula="child ~ parent", data=heig).fit() result.params # q1 - what's a better way to do this? x = [0.18, -1.54, 0.42, 0.95] w = [2, 1, 3, 1] df = pd.DataFrame([x, w]).T df.columns = ['x', 'w'] df square = lambda x: x**2 f1 = lambda mu: (df['w'] * (df['x'] - mu).apply(square)).sum() mus = np.arange(0, 1, .01) f1(.18) mus = pd.DataFrame([(mu, f1(mu),) for mu in mus], columns = ['mu', 'fx']) pd.DataFrame.plot(mus, x = 'mu', y='fx', title='Minimizing Mu'); mus[mus['fx']==mus['fx'].min()] # q2 x = [0.8, 0.47, 0.51, 0.73, 0.36, 0.58, 0.57, 0.85, 0.44, 0.42] y = [1.39, 0.72, 1.55, 0.48, 1.19, -1.59, 1.23, -0.65, 1.49, 0.05] df = pd.DataFrame([x, y]).T df.columns = ['x', 'y'] df.head() #result = sm.ols(formula="child ~ parent", data=heig).fit() # y intercept set to zero result = sm.ols(formula="y ~ x - 1", data=df).fit() result.summary() x = 'x' y = 'y' num = (df[x] * df[y]).sum() den = (df[x]).apply(square).sum() num / den %%R -i x require(dataset) data(mtcars) head(mtcars) lm(mpg~wt, data=mtcars) 1.5 *0.4 #??? x = [8.58, 10.46, 9.01, 9.64, 8.86] s = pd.Series(x) # normalized values ns = (s - s.mean())/s.std() ns ns.mean() df result = sm.ols("y ~ x", data=df).fit() result.summary() df['x'].mean() df r1 = sm.ols("x~y", data=df).fit() r2 = sm.ols("y~x", data=df).fit() r2.params.x / r1.params.y df['y'].var() / df['x'].var() %%R -o diamond require(UsingR) require(ggplot2) data(diamond) g <- ggplot(diamond, aes(carat, price)) + geom_point() + theme_bw() g <- g + geom_smooth(method=lm, se=FALSE) g formula = "price ~ carat" r1 = sm.ols(formula, data=diamond).fit() r1.summary() # same formula, but subtracting out the carat mean to get a usable intercept values f = "price ~ I(carat - carat.mean())" r2 = sm.ols(f, data=diamond).fit() # note that the slope is the same print(r1.params) print(r2.params) print(diamond.price.mean()) diamond.price.mean() # A one carat increase in diamond is pretty big. f = "price ~ I(carat *10)" r3 = sm.ols(f, data=diamond).fit() r3.params # And the slope is 1/coef we mutliplied the regressor by df = pd.DataFrame([0.16, 0.27, 0.34], columns=['carat']) df['price'] = r3.predict(df) df diamond['price_predict'] = r3.predict(diamond) %%R -i diamond g <- ggplot(diamond, aes(carat, price)) + geom_point(color='green') g <- g + geom_line(data=diamond, aes(carat, price_predict), colour='blue', linetype=2) g <- g + geom_point(data=diamond, aes(carat, price_predict), colour='red', shape=3) g + theme_bw() # green dots - actual observations # blue dashed line - predictions # red dashes - places where we made predictions. Note that these are the same x_i values where we have observed points %%R -o diamond data(diamond) y <- diamond$price x <- diamond$carat n <- length(y) fit <- lm(y~x) e <- resid(fit) yhat <- predict(fit) max(abs(e - (y-yhat))) diamond x = diamond.carat y = diamond.price results = sm.ols("price ~ carat", data=diamond).fit() e = results.resid yhat = results.predict() df = pd.DataFrame([e, y - yhat]).T df.columns = ['e', 'y - yhat'] df.head() (df.e - df['y - yhat']).abs().max() from statsmodels.api import graphics fig, ax = plt.subplots(figsize=(12, 8)) fig = graphics.plot_fit(results, "carat", ax=ax) fig, ax = plt.subplots(figsize=(12, 8)) graphics.plot_regress_exog(results, "carat", fig); # generated data set from random import uniform, normalvariate, seed from math import sin seed(123) x = [uniform(-3, 3) for _ in xrange(100)] y = [_ + sin(_) + normalvariate(mu=100, sigma=.2) for _ in x] df = pd.DataFrame([_ for _ in zip(x, y)], columns = ['x', 'y']) df = df.sort('x') plt.scatter(df.y, df.x); results = sm.ols("y ~ x", data=df).fit() # why does this show up twice?? # todo: look at the source and figure fig, ax = plt.subplots(figsize=(12, 8)) graphics.plot_regress_exog(results, "x", fig); plt.plot(df.x, results.resid, 'o'); x = [uniform(0, 6) for _ in xrange(100)] df = pd.DataFrame(x, columns = ['x']) ygen = lambda x: x + normalvariate(0, .001*x) df['y'] = [ygen(x) for x in df.x] df = df.sort('x') plt.plot(df.x, df.y, 'o'); r_hetero = sm.ols("y~x", data=df).fit() plt.plot(df.x, r_hetero.resid, 'o'); fig, ax = plt.subplots(figsize=(12, 8)) graphics.plot_regress_exog(r_hetero, "x", fig); diamond.head() %%R y <- diamond$price; x <- diamond$carat; n <- length(y) fit <- lm(y ~ x) summary(fit)$sigma sqrt(sum(resid(fit)^2) / (n - 2)) y = 'price'; x = 'carat' fit = sm.ols('%s ~ %s' % (y, x, ), data=diamond).fit() e = (fit.predict(diamond) - diamond.price)**2 from math import sqrt sigma = sqrt((e.shape[0]-2)**-1 * e.sum()) sigma # Normalized by N-1 by default. This can be changed using the ddof argument sqrt(fit.resid.var(ddof=2)) from __future__ import print_function """ Edward Tufte uses this example from Anscombe to show 4 datasets of x and y that have the same mean, standard deviation, and regression line, but which are qualitatively different. matplotlib fun for a rainy day """ from pylab import * x = array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]) y1 = array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]) y2 = array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]) y3 = array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]) x4 = array([8,8,8,8,8,8,8,19,8,8,8]) y4 = array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89]) def fit(x): return 3+0.5*x xfit = array( [amin(x), amax(x) ] ) subplot(221) plot(x,y1,'ks', xfit, fit(xfit), 'r-', lw=2) axis([2,20,2,14]) setp(gca(), xticklabels=[], yticks=(4,8,12), xticks=(0,10,20)) text(3,12, 'I', fontsize=20) subplot(222) plot(x,y2,'ks', xfit, fit(xfit), 'r-', lw=2) axis([2,20,2,14]) setp(gca(), xticklabels=[], yticks=(4,8,12), yticklabels=[], xticks=(0,10,20)) text(3,12, 'II', fontsize=20) subplot(223) plot(x,y3,'ks', xfit, fit(xfit), 'r-', lw=2) axis([2,20,2,14]) text(3,12, 'III', fontsize=20) setp(gca(), yticks=(4,8,12), xticks=(0,10,20)) subplot(224) xfit = array([amin(x4),amax(x4)]) plot(x4,y4,'ks', xfit, fit(xfit), 'r-', lw=2) axis([2,20,2,14]) setp(gca(), yticklabels=[], yticks=(4,8,12), xticks=(0,10,20)) text(3,12, 'IV', fontsize=20) #verify the stats pairs = (x,y1), (x,y2), (x,y3), (x4,y4) for x,y in pairs: print ('mean=%1.2f, std=%1.2f, r=%1.2f'%(mean(y), std(y), corrcoef(x,y)[0][1])) show() x = array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]) y1 = array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]) y2 = array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]) y3 = array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]) x4 = array([8,8,8,8,8,8,8,19,8,8,8]) y4 = array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89]) df = pd.DataFrame([x, x4, y1, y2, y3, y4]).T df.columns = ['x', 'x4', 'y1', 'y2', 'y3', 'y4'] def regress(x, y, data): formula = '%s ~ %s' % (y, x, ) return sm.ols(formula, data=data).fit() subplot(221) plt.plot(df.x, regress('x', 'y1', df).resid, 'ro'); plt.title('I'); subplot(222) plt.plot(df.x, regress('x', 'y2', df).resid, 'ro'); plt.title('II'); subplot(223) plt.plot(df.x, regress('x', 'y3', df).resid, 'ro'); plt.title('III'); subplot(224) plt.plot(df.x, regress('x4', 'y4', df).resid, 'ro'); plt.title('IV'); %%R -o diamond library(UsingR); data(diamond) y <- diamond$price; x <- diamond$carat; n <- length(y) beta1 <- cor(y, x) * sd(y) / sd(x) beta0 <- mean(y) - beta1 * mean(x) e <- y - beta0 - beta1 * x sigma <- sqrt(sum(e^2) / (n-2)) ssx <- sum((x - mean(x))^2) seBeta0 <- (1 / n + mean(x) ^ 2 / ssx) ^ .5 * sigma seBeta1 <- sigma / sqrt(ssx) tBeta0 <- beta0 / seBeta0; tBeta1 <- beta1 / seBeta1 pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE) pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE) coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1)) colnames(coefTable) <- c("Estimate", "Std. Error", "t value", "P(>|t|)") rownames(coefTable) <- c("(Intercept)", "x") coefTable import scipy.stats as st n = 46 st.t.pdf(abs(1), df = n - 2) x = diamond['carat'] y = diamond['price'] n = float(len(y)) beta1 = y.corr(x) * y.std() / x.std() # regression slope beta0 = y.mean() - beta1 * x.mean() # regression intercept e = y - (beta1 * x + beta0) # errors sigma = ((e**2).sum()/(n - 2))**0.5 # unbiased residual mean ssx = ((x - x.mean())**2).sum() # sum of squared Xs seBeta0 = (1/n + x.mean()**2 / ssx)**0.5 * sigma seBeta1 = sigma / ssx**0.5 tBeta0 = beta0 / seBeta0 tBeta1 = beta1 / seBeta1 # TODO: review hypothesis testing pBeta0 = 2 * (1-st.t.cdf(abs(tBeta0), df = n - 2)) pBeta1 = 2 * (1-st.t.cdf(abs(tBeta1), df = n - 2)) l = [[beta0, seBeta0, tBeta0, pBeta0], [beta1, seBeta1, tBeta1, pBeta1]] coefTable = pd.DataFrame((_ for _ in l), columns = ["Estimate", "Std. Error", "t value", "P(>|t|)"], index = ["(Intercept)", "x"]) coefTable fit = sm.ols("price ~ carat", data=diamond).fit() fit.summary() %%R data(diamond) fit <- lm("price ~ carat", data=diamond) summary(fit) print ('\n'.join(st.t.pdf.__doc__.split('\n')[:100])) import numpy as np x = np.arange(-3, 3, .01) ypdf = [st.t.pdf(_, n - 2) for _ in x] ycdf = [1 - st.t.cdf(_, n - 2) for _ in x] f, axarr = plt.subplots(2, sharex=True) axarr[0].plot(x, ypdf) axarr[0].set_title('PDF') axarr[1].plot(x, ycdf) axarr[1].set_title('CDF'); %%R -o dfR # test to figure out wtf the pt function is doing require(ggplot2) n <- 48 x <- seq(-3, 3, by=.01) y <- pt(x, df = n - 2, lower.tail = FALSE) df1 <- data.frame(x, y, 'false') names(df1) <- c('x', 'y', 'tail') y <- pt(x, df = n - 2, lower.tail = TRUE) df2 <- data.frame(x, y, 'true') names(df2) <- c('x', 'y', 'tail') dfR <- rbind(df1, df2) ggplot(dfR, aes(x, y, colour=tail)) + geom_line() + theme_bw() + ggtitle("Effect of lower.tail parameter in `pt` function") dfR = dfR[dfR['tail'] == 'false'] f, axarr = plt.subplots(2, sharex=True) axarr[0].plot(dfR['x'], dfR['y']) axarr[0].set_title('R CDF') axarr[1].plot(x, ycdf) axarr[1].set_title('Python CDF'); %%R sumCoef <- summary(fit)$coefficients sumCoef %%R sumCoef <- summary(fit)$coefficients sumCoef[1,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2] %%R sumCoef[2,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2] coefTable['Estimate']['x'] + pd.Series([-1, 1]) * coefTable['Std. Error']['x'] * st.t.ppf(.975, fit.df_resid) coefTable['Estimate']['(Intercept)'] + pd.Series([-1, 1]) * coefTable['Std. Error']['(Intercept)'] * st.t.ppf(.975, fit.df_resid) %%R require(reshape) require(ggplot2) n <- 3 by <- 0.001 x <- seq(-n, n, by) qt <- qt(x, df = 30) pt <- pt(x, df=30) dt <- dt(x, df=30) rt <- rt(x, df=30) df <- data.frame(x, qt, pt, dt, rt) names(df) <- c('x', 'qt\ndensity', 'pt\ndistribution', 'dt\nquantile', 'rt\nrandom') mdf <- melt.data.frame(df, id = c('x')) g <- ggplot(mdf, aes(x, value, colour=variable)) + geom_point(alpha=0.1) g <- g + facet_wrap(~variable, scale='free_y') + theme_bw() g <- g + ggtitle("R Functions for T-Distribution") + theme(legend.position="none") g <- g + ylab('f(x)') g x = np.arange(-3, 3, .001) dt = st.t.pdf(x, 30) qt = st.t.ppf(x, 30) pt = st.t.cdf(x, 30) f, axarr = plt.subplots(3, sharex=True) axarr[0].plot(x, dt) axarr[0].set_title('Python st.t.pdf = R dt') axarr[1].plot(x, qt) axarr[1].set_title('Python st.t.ppf = R qt') axarr[2].plot(x, pt) axarr[2].set_title('Python st.t.cdf = R pt'); st.t.ppf(.975, 46) st.norm.ppf(.975) %%R qt(.975, df = fit$df) %%R data(diamond) x <- diamond$carat; y <- diamond$price plot(x, y, frame=FALSE,xlab="Carat",ylab="Dollars",pch=21,col="black", bg="lightblue", cex=2) abline(fit, lwd = 2) xVals <- seq(min(x), max(x), by = .01) yVals <- beta0 + beta1 * xVals se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx) # orange se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx) # blue lines(xVals, yVals + 2 * se1, col='orange') lines(xVals, yVals - 2 * se1, col='orange') lines(xVals, yVals + 2 * se2, col='blue') lines(xVals, yVals - 2 * se2, col='blue') x = [0.61, 0.93, 0.83, 0.35, 0.54, 0.16, 0.91, 0.62, 0.62] y = [0.67, 0.84, 0.6, 0.18, 0.85, 0.47, 1.1, 0.65, 0.36] df = pd.DataFrame(zip(x, y), columns = ['x', 'y']) from math import sqrt x = df['x'] y = df['y'] n = len(x) beta1 = y.corr(x) * (y.std() / x.std()) beta0 = y.mean() - beta1 * x.mean() ssx = ((x - mean(x))**2).sum() yhat = beta0 + beta1 * x e = y - yhat sigma = sqrt(1.0/(n-2) * (e**2).sum()) se_beta1 = sqrt((sigma**2 / ssx)) p = beta1 / se_beta1 p results = sm.ols('y~x', data=df).fit().summary() results sigma %%R -o mtcars data(mtcars) r = sm.ols('mpg ~ I(wt - wt.mean())', data=mtcars).fit() alpha = 0.05 20.0906 + pd.Series([-1, 1]) * st.t.ppf(1 - alpha/2, 30) * .538 t = r.summary().tables[1] print (t.as_text()) (20.0906 - 18.991) / .538 %%R data(diamond) x <- diamond$carat; y <- diamond$price xVals <- seq(min(x), max(x), by = .01) yVals <- beta0 + beta1 * xVals se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx) # orange se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx) # blue x = mtcars['wt'] y = mtcars['mpg'] n = len(x) ssx = (1.0/n) * ((x- x.mean())**2).sum() e = (r.resid - x) sigma = sqrt(1.0/(n-2) * (e**2).sum()) xVals = 3 se = sigma * sqrt(1 + 1 / n + (xVals - mean(x))**2/ssx) se (3 - mtcars.wt.mean()) * r.params[1] + r.params[0] + pd.Series([-1, 1]) * se * st.norm.ppf(.975) sm.ols('mpg ~ I(wt/2)', data=mtcars).fit().summary() r.resid.sum() %%R library(datasets); data(swiss); require(stats); require(graphics) pairs(swiss, panel = panel.smooth, main = "Swiss data", col = 3 + (swiss$Catholic > 50)) %%R -o swiss -w 960 -h 960 require(GGally) library(datasets); data(swiss); theme_set(theme_bw()) swiss$cath_majority = factor(swiss$Catholic >= 50) ggpairs(swiss, lower = list(continuous = "smooth"), colour = "cath_majority" ) _ = pd.scatter_matrix(swiss, figsize=(12,8), diagonal='kde'); try: del swiss['cath_majority'] except KeyError: pass swiss.columns = [_.replace('.', '_') for _ in swiss.columns] y = 'Fertility' x = "+".join(swiss.columns - [y]) formula = '%s ~ %s' % (y, x) results1 = sm.ols(formula, data=swiss).fit() results1.summary() x = 'Agriculture' y = 'Fertility' results = sm.ols('%s ~ %s' % (y, x, ), data=swiss).fit() results.summary() %%R -w 300 -h 300 -o df_simul require(ggplot2) require(GGally) n <- 100; x2 <- 1 : n; x1 <- .01 * x2 + runif(n, -.1, .1); y = -x1 + x2 + rnorm(n, sd = .01) df_simul <- data.frame(x1, x2, y) theme_set(theme_bw()) (ggpairs(df_simul)) r1 = sm.ols('y ~ x1', data=df_simul).fit() r2 = sm.ols('y ~ x1 + x2', data=df_simul).fit() r1.params, r2.params graphics.plot_regress_exog(r1, 'x1'); graphics.plot_regress_exog(r2, "x1"); x1 = df_simul['x1'] x2 = df_simul['x2'] y = df_simul['y'] f, axarr = plt.subplots(2, sharex=True) axarr[0].plot(x1, y) axarr[0].set_title('x1') axarr[1].plot(x2, y) axarr[1].set_title('x2'); r1 = sm.ols('x1 ~ x2', data=df_simul).fit() r2 = sm.ols('y ~ x2', data=df_simul).fit() # Here we factor out the effect of x2 # Not really sure what the point of is this plt.plot(r1.resid, r2.resid, 'o'); %%R -o swiss data(swiss) swiss.columns = [_.replace('.', '_') for _ in swiss.columns] swiss['z'] = swiss['Agriculture'] + swiss['Education'] y = 'Fertility' x = "+".join(swiss.columns - [y]) formula = '%s ~ %s' % (y, x) swiss.head() x r = sm.ols('%s ~ %s' % (y, x), data=swiss).fit(); # wait, this doesn't work. sm.ols doesn't pick up that Z is no good r.summary() %%R # R does pick up that Z is no good. Note that the agriculture value is different # Post question about this data(swiss) z<-swiss$Agriculture+swiss$Education lm(Fertility~.+z,data=swiss) %%R -o InsectSprays require(ggplot) g <- ggplot(InsectSprays, aes(spray, count)) + geom_boxplot() #summary(lm(count ~ spray, data = InsectSprays))$coef print(g) summary(lm(count ~ spray, data = InsectSprays))$coef # InsectSprays['spray'].astype(str) results = sm.ols("count ~ spray", data = InsectSprays).fit() results.summary() ### Hard coding the dummy variables %%R summary(lm(count~ I(1*(spray=='B'))+I(1*(spray=='C'))+ I(1*(spray=='D'))+I(1*(spray=='E'))+ I(1*(spray=='F')) ,data=InsectSprays))$coef formula = "count~ I(1*(spray=='B'))+I(1*(spray=='C'))+I(1*(spray=='D'))+I(1*(spray=='E'))+I(1*(spray=='F'))" formula sm.ols(formula, data=InsectSprays).fit().summary() %%R lm(count~ I(1*(spray=='B'))+I(1*(spray=='C'))+ I(1*(spray=='D'))+I(1*(spray=='E'))+ I(1*(spray=='F'))+I(1*(spray=='A')),data=InsectSprays) formula="count~I(1*(spray=='B'))+I(1*(spray=='C'))+ I(1*(spray=='D'))+I(1*(spray=='E'))+ I(1*(spray=='F'))+I(1*(spray=='A'))" sm.ols(formula, data=InsectSprays).fit().summary() formula = "count~ I(1*(spray=='B'))+I(1*(spray=='C'))+I(1*(spray=='D'))+I(1*(spray=='E'))+I(1*(spray=='F')) -1" sm.ols(formula, data=InsectSprays).fit().summary() InsectSprays.head() IS = InsectSprays grouped = IS.groupby("spray") grouped.mean() %%R -o hunger hunger <- read.csv('data/hunger.csv') hunger <- hunger[hunger$Sex != "Both sexes", ] lm1 <- lm(hunger$Numeric ~ hunger$Year) qqnorm(residuals(lm1)) qqline(residuals(lm1)) hunger.describe() # columns that need to be fixed from the R import of data cols = ['Low', 'High', 'Comments'] for col in cols: hunger[col] = hunger[col].replace(-2147483648 , NaN) hunger.head() y = 'Numeric' x = 'Year' formula = '%s ~ %s' %(y, x,) fit = sm.ols(formula, data=hunger).fit() fit.summary() fig, ax = plt.subplots(figsize=(12, 8)) graphics.plot_regress_exog(fit, "Year", fig); %%R require(ggplot2) g <- ggplot(hunger, aes(Year, Numeric, colour=Sex)) + geom_point(alpha=0.5) g <- g + geom_smooth(method='lm', se=T) g %%R lmM <- lm(hunger$Numeric[hunger$Sex=="Male"] ~ hunger$Year[hunger$Sex=="Male"]) lmF <- lm(hunger$Numeric[hunger$Sex=="Female"] ~ hunger$Year[hunger$Sex=="Female"]) print(summary(lmM)) plot(hunger$Year,hunger$Numeric,pch=19) points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1)) lines(hunger$Year[hunger$Sex=="Male"],lmM$fitted,col="black",lwd=3) lines(hunger$Year[hunger$Sex=="Female"],lmF$fitted,col="red",lwd=3) %%R lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex) print(summary(lmBoth)) plot(hunger$Year,hunger$Numeric,pch=19) points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1)) abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3) abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] ),col="black",lwd=3) %%R lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex*hunger$Year) print(summary(lmBoth)) plot(hunger$Year,hunger$Numeric,pch=19) points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1)) abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3) abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] +lmBoth$coeff[4]),col="black",lwd=3) %%R n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); # treatment effect x <- c(runif(n/2), runif(n/2)); # beta0 <- 0; #intercept beta1 <- 2; #slope tau <- 1; #treatment effect sigma <- .2 # variance y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma) #simulate reg plot(x, y, type = "n", frame = FALSE) abline(lm(y ~ x), lwd = 2) # disregard treatment abline(h = mean(y[1 : (n/2)]), lwd = 3) #mean untreated group abline(h = mean(y[(n/2 + 1) : n]), lwd = 3) # mean treated gropu fit <- lm(y ~ x + t) # correct model with treatment effect # The slope is simply x for both lines. When considering t, then the intersect is \beta0 + t abline(coef(fit)[1], coef(fit)[2], lwd = 3) abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3) points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2) points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2) %%R n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2), 1.5 + runif(n/2)); beta0 <- 0; beta1 <- 2; tau <- 0; sigma <- .2 y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma) plot(x, y, type = "n", frame = FALSE) abline(lm(y ~ x), lwd = 2) abline(h = mean(y[1 : (n/2)]), lwd = 3) abline(h = mean(y[(n/2 + 1) : n]), lwd = 3) fit <- lm(y ~ x + t) abline(coef(fit)[1], coef(fit)[2], lwd = 3) abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3) points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2) points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2) from datetime import timedelta YouTubeVideo('BhGQ4YCC8xA', start=int(timedelta(minutes=5, seconds=40).total_seconds()))