options(repr.plot.width=5, repr.plot.height=3) set.seed(0) library(alr3) data(heights) M = heights$Mheight D = heights$Dheight library(ggplot2) heights_fig = ggplot(heights, aes(Mheight, Dheight)) + geom_point() + theme_bw(); heights_fig X = 66 rect = data.frame(xmin=X-.5, xmax=X+.5, ymin=-Inf, ymax=Inf) heights_fig + geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax), color="grey20", alpha=0.5, inherit.aes = FALSE) selected_points = (M <= X+.5) & (M >= X-.5) mean_within_slice = mean(D[selected_points]) mean_within_slice X = 60 selected_points = (M <= X+.5) & (M >= X-.5) mean_within_slice = mean(D[selected_points]) mean_within_slice X = 60 rect = data.frame(xmin=X-.5, xmax=X+.5, ymin=-Inf, ymax=Inf) heights_fig + geom_rect(data=rect, aes(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax), color="grey20", alpha=0.5, inherit.aes = FALSE) parameters = lm(D ~ M)$coef print(parameters) intercept = parameters[1] slope = parameters[2] heights_fig + geom_abline(slope=slope, intercept=intercept, color='red', size=3) X = rnorm(50) Y = 1.5 + 0.1 * X + rnorm(50) * 2 parameters = lm(Y ~ X)$coef intercept = parameters[1] slope = parameters[2] ggplot(data.frame(X, Y), aes(X, Y)) + geom_point() + geom_abline(slope=slope, intercept=intercept) grid_intercept = seq(intercept - 2, intercept + 2, length=100) grid_slope = seq(slope - 2, slope + 2, length=100) loss_data = expand.grid(intercept_=grid_intercept, slope_=grid_slope) loss_data$squared_error = numeric(nrow(loss_data)) for (i in 1:nrow(loss_data)) { loss_data$squared_error[i] = sum((Y - X * loss_data$slope_[i] - loss_data$intercept_[i])^2) } squared_error_fig = (ggplot(loss_data, aes(intercept_, slope_, fill=squared_error)) + geom_raster() + scale_fill_gradientn(colours=c("gray","yellow","blue"))) squared_error_fig loss_data$absolute_error = numeric(nrow(loss_data)) for (i in 1:nrow(loss_data)) { loss_data$absolute_error[i] = sum(abs(Y - X * loss_data$slope_[i] - loss_data$intercept_[i])) } absolute_error_fig = (ggplot(loss_data, aes(intercept_, slope_, fill=absolute_error)) + geom_raster() + scale_fill_gradientn(colours=c("gray","yellow","blue"))) absolute_error_fig url = 'http://www.stanford.edu/class/stats191/data/wage.csv' wages = read.table(url, sep=',', header=TRUE) print(head(wages)) attach(wages) mean(logwage) wages.lm = lm(logwage ~ education) print(wages.lm) logwage_fig = (ggplot(wages, aes(education, logwage)) + geom_point() + theme_bw() + geom_abline(slope=wages.lm$coef[2], intercept=wages.lm$coef[1], color='red', size=3)) logwage_fig beta.1.hat = cov(education, logwage) / var(education) beta.0.hat = mean(logwage) - beta.1.hat * mean(education) print(c(beta.0.hat, beta.1.hat)) print(coef(wages.lm)) sigma.hat = sqrt(sum(resid(wages.lm)^2) / wages.lm$df.resid) c(sigma.hat, sqrt(sum((logwage - predict(wages.lm))^2) / wages.lm$df.resid)) summary(wages.lm) rejection_region = function(dens, q_lower, q_upper, xval) { fig = (ggplot(data.frame(x=xval), aes(x)) + stat_function(fun=dens, geom='line') + stat_function(fun=function(x) {ifelse(x > q_upper | x < q_lower, dens(x), NA)}, geom='area', fill='#CC7777') + labs(y='Density', x='T') + theme_bw()) } xval = seq(-4,4,length=101) q = qnorm(0.975) Z_fig = rejection_region(dnorm, -q, q, xval) + annotate('text', x=2.5, y=dnorm(2)+0.3, label='Z statistic', color='#CC7777') Z_fig q10 = qt(0.975, 10) T_fig = (Z_fig + stat_function(fun=function(x) {ifelse(x > q10 | x < -q10, dt(x, 10), NA)}, geom='area', fill='#7777CC', alpha=0.5) + stat_function(fun=function(x) {dt(x, 10)}, color='blue') + annotate('text', x=2.5, y=dnorm(2)+0.27, label='T statistic, df=10', color='#7777CC') ); T_fig SE.beta.1.hat = (sigma.hat * sqrt(1 / sum((education - mean(education))^2))) Tstat = (beta.1.hat - 0) / SE.beta.1.hat data.frame(beta.1.hat, SE.beta.1.hat, Tstat) summary(wages.lm) 2*(1 - pt(Tstat, wages.lm$df.resid)) detach(wages) alpha = 0.05 n = length(M) qt(1-0.5*alpha, n-2) qnorm(1 - 0.5*alpha) L = beta.1.hat - qt(0.975, wages.lm$df.resid) * SE.beta.1.hat U = beta.1.hat + qt(0.975, wages.lm$df.resid) * SE.beta.1.hat data.frame(L, U) confint(wages.lm) height.lm = lm(D~M) predict(height.lm, list(M=c(66, 60)), interval='confidence', level=0.90) heights_fig X = 66 selected_points = (M <= X+.5) & (M >= X-.5) center = mean(D[selected_points]) sd_ = sd(D[selected_points]) L = center - qnorm(0.95) * sd_ U = center + qnorm(0.95) * sd_ data.frame(center, L, U) predict(height.lm, list(M=66), interval='prediction', level=0.90) (69.41-61.94) sigma.hat.height = sqrt(sum(resid(height.lm)^2) / height.lm$df.resid) 2 * qnorm(0.95) * sigma.hat.height