options(repr.plot.width=4, repr.plot.height=4) bacteria.table = read.table('http://stats191.stanford.edu/data/bacteria.table', header=T) plot(bacteria.table$t, bacteria.table$N_t, pch=23, cex=2, bg='orange') bacteria.lm = lm(N_t ~ t, bacteria.table) abline(bacteria.lm$coef, lwd=2, col='red') par(mfrow=c(2,2)) plot(bacteria.lm, pch=23, bg='orange') bacteria.log.lm <- lm(log(N_t) ~ t, bacteria.table) plot(bacteria.table$t, bacteria.table$N_t, pch=23, cex=2, bg='orange') lines(bacteria.table$t, fitted(bacteria.lm), lwd=2, col='red') lines(bacteria.table$t, exp(fitted(bacteria.log.lm)), lwd=2, col='green') par(mfrow=c(2,2)) plot(bacteria.log.lm, pch=23, bg='orange') education.table = read.table('http://stats191.stanford.edu/data/education1975.table', header=T) education.table$Region = factor(education.table$Region) education.lm = lm(Y ~ X1 + X2 + X3, data=education.table) summary(education.lm) par(mfrow=c(2,2)) plot(education.lm) boxplot(rstandard(education.lm) ~ education.table$Region, col=c('red', 'green', 'blue', 'yellow')) keep.subset = (education.table$STATE != 'AK') education.noAK.lm = lm(Y ~ X1 + X2 + X3, subset=keep.subset, data=education.table) summary(education.noAK.lm) par(mfrow=c(2,2)) plot(education.noAK.lm) par(mfrow=c(1,1)) boxplot(rstandard(education.noAK.lm) ~ education.table$Region[keep.subset], col=c('red', 'green', 'blue', 'yellow')) educ.weights = 0 * education.table$Y for (region in levels(education.table$Region)) { subset.region = (education.table$Region[keep.subset] == region) educ.weights[subset.region] <- 1.0 / (sum(resid(education.noAK.lm) [subset.region]^2) / sum(subset.region)) } unique(educ.weights) education.noAK.weight.lm <- lm(Y ~ X1 + X2 + X3, weights=educ.weights, subset=keep.subset, data=education.table) summary(education.noAK.weight.lm) summary(education.noAK.lm) par(mfrow=c(2,2)) plot(education.noAK.weight.lm) par(mfrow=c(1,1)) boxplot(resid(education.noAK.weight.lm, type='pearson') ~ education.table$Region[keep.subset], col=c('red', 'green', 'blue', 'yellow')) ntrial = 10000 # how many trials will we be doing? nsample = 20 # how many points in each trial sd = c(1:20) # how does the variance change mu = 2.0 get.sample <- function() { return(rnorm(nsample)*sd + mu) } unweighted.estimate <- function(cur.sample) { return(mean(cur.sample)) } unweighted.estimate <- numeric(ntrial) weighted.estimate <- numeric(ntrial) suboptimal.estimate <- numeric(ntrial) ntrial = 1000 for (i in 1:ntrial) { cur.sample = get.sample() unweighted.estimate[i] = mean(cur.sample) weighted.estimate[i] = sum(cur.sample/sd^2) / sum(1/sd^2) suboptimal.estimate[i] = sum(cur.sample/sd) / sum(1/sd) } data.frame(mean(unweighted.estimate), sd(unweighted.estimate)) data.frame(mean(weighted.estimate), sd(weighted.estimate)) data.frame(mean(suboptimal.estimate), sd(suboptimal.estimate)) densY = c(density(unweighted.estimate)$y, density(weighted.estimate)$y, density(suboptimal.estimate)$y) densX = c(density(unweighted.estimate)$x, density(weighted.estimate)$x, density(suboptimal.estimate)$x) options(repr.plot.width=5, repr.plot.height=5) plot(densX, densY, type='n', main='Comparison of densities of the estimators', cex=0.8) lines(density(weighted.estimate), col='red', lwd=4) lines(density(unweighted.estimate), col='blue', lwd=4) lines(density(suboptimal.estimate), col='purple', lwd=4) legend(6,0.3, c('optimal', 'suboptimal', 'mean'), col=c('red', 'purple', 'blue'), lwd=rep(4,3), cex=0.8)