import pandas as pd
import matplotlib.pyplot as plt
import rpy2
from IPython.display import YouTubeVideo
%load_ext rpy2.ipython
%matplotlib inline

heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None)
heig = pd.melt(heig, value_vars = ['child', 'parent'], value_name = 'height')

%%R -i heig
require(ggplot2)
df <- as.data.frame(heig)
g <- ggplot(heig, aes(height, fill=variable)) + geom_histogram(binwidth=1) + theme_bw()
g <- g + facet_grid(~variable)

g

child = heig[heig['variable'] == 'child']

import numpy as np
child.head()

hrange = np.arange(child['height'].min(), child['height'].max(), .5)

MSE = lambda x: ((child.height - x)**2).mean()

MSEdf = pd.DataFrame([(_, MSE(_)) for _ in hrange], columns = ['mu', 'MSE'])

pd.DataFrame.plot(MSEdf, x = 'mu', y='MSE', title='Minimizing Mu');

child.mean()

# reimport so I don't have to unmelt
heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None)
# get counts of values for the bubble plot
h = pd.DataFrame({'count' : heig.groupby( [ "child", "parent"] ).size()}).reset_index()
h[:10]

%%R -i h
require(ggplot2)
require(hexbin)
df <- as.data.frame(h)
g <- ggplot(df, aes(parent, child, size=count)) + geom_point() + ggtitle("Bubble Plot of Height")
g + theme_bw()

col = 'child'
h[col] = h[col] - h[col].mean()
col = 'parent'
h[col] = h[col] - h[col].mean()

%%R -i h
require(ggplot2)
require(hexbin)
df <- as.data.frame(h)
g <- ggplot(df, aes(parent, child, size=count)) + geom_point() + ggtitle("Bubble Plot Height De-meaned") + theme_bw()
g2 <- geom_smooth(data=df, aes(parent, child, weight=count), colour='red', method=lm, se=FALSE)
g3 <- geom_smooth(data=df, aes(parent, child), colour='green', method=lm, se=FALSE)
g4 <- geom_smooth(data=df, aes(parent, child), colour='blue', method=loess, se=FALSE)
g + g2 + g3 + g4 + theme(legend.position="none")

col = 'child'
heig[col] = heig[col] - heig[col].mean()
col = 'parent'
heig[col] = heig[col] - heig[col].mean()

MSE = lambda beta: ((heig['child'] - heig['parent'] * beta)**2).mean()

# try slopes between -3 and 3
betas = np.arange(-3, 3, .01)

df = pd.DataFrame([(beta, MSE(beta),) for beta in betas], columns = ['beta', 'MSE'])

df.plot(x = 'beta', y='MSE', title = 'finding the best Beta empirically');

min_beta = float(df[df['MSE']==df['MSE'].min()]['beta'])
df[df['MSE']==df['MSE'].min()]

float(min_beta)

from scipy.stats import linregress

slope, intercept, rvalue, pvalue, stderr = linregress(heig['parent'], heig['child'])

slope

MSE(slope)

import statsmodels.formula.api as sm

result = sm.ols(formula="child ~ parent", data=heig).fit()

heig = pd.DataFrame.from_csv('data/Galton.csv', index_col=None)
heig.head()

y = 'child'
x = 'parent'

beta1 = heig[y].corr(heig[x]) * heig[y].std() / heig[x].std()

beta0 = heig[y].mean() - beta1 * heig[x].mean()
beta0

beta1

result = sm.ols(formula="child ~ parent", data=heig).fit()

result.params

# q1 - what's a better way to do this?
x = [0.18, -1.54, 0.42, 0.95]
w = [2, 1, 3, 1]

df = pd.DataFrame([x, w]).T
df.columns = ['x', 'w']
df

square = lambda x: x**2
f1 = lambda mu: (df['w'] * (df['x'] - mu).apply(square)).sum()

mus = np.arange(0, 1, .01)
f1(.18)

mus = pd.DataFrame([(mu, f1(mu),) for mu in mus], columns = ['mu', 'fx'])

pd.DataFrame.plot(mus, x = 'mu', y='fx', title='Minimizing Mu');

mus[mus['fx']==mus['fx'].min()]

# q2
x = [0.8, 0.47, 0.51, 0.73, 0.36, 0.58, 0.57, 0.85, 0.44, 0.42]
y = [1.39, 0.72, 1.55, 0.48, 1.19, -1.59, 1.23, -0.65, 1.49, 0.05]
df = pd.DataFrame([x, y]).T
df.columns = ['x', 'y']
df.head()

#result = sm.ols(formula="child ~ parent", data=heig).fit()
# y intercept set to zero
result = sm.ols(formula="y ~ x - 1", data=df).fit()

result.summary()

x = 'x'
y = 'y'
num = (df[x] * df[y]).sum()
den = (df[x]).apply(square).sum()

num / den

%%R -i x
require(dataset)
data(mtcars)
head(mtcars)
lm(mpg~wt, data=mtcars)

1.5 *0.4 #???

x = [8.58, 10.46, 9.01, 9.64, 8.86]
s = pd.Series(x)

# normalized values
ns = (s - s.mean())/s.std()
ns

ns.mean()

df

result = sm.ols("y ~ x", data=df).fit()

result.summary()

df['x'].mean()

df

r1 = sm.ols("x~y", data=df).fit()
r2 = sm.ols("y~x", data=df).fit()

r2.params.x / r1.params.y

df['y'].var() / df['x'].var()

%%R -o diamond
require(UsingR)
require(ggplot2)
data(diamond)
g <- ggplot(diamond, aes(carat, price)) + geom_point() + theme_bw()
g <- g + geom_smooth(method=lm, se=FALSE)
g


formula = "price ~ carat"
r1 = sm.ols(formula, data=diamond).fit()

r1.summary()

# same formula, but subtracting out the carat mean to get a usable intercept values
f = "price ~ I(carat - carat.mean())"
r2 = sm.ols(f, data=diamond).fit()

# note that the slope is the same
print(r1.params)
print(r2.params)
print(diamond.price.mean())

diamond.price.mean()

# A one carat increase in diamond is pretty big.
f = "price ~ I(carat *10)"
r3 = sm.ols(f, data=diamond).fit()
r3.params

# And the slope is 1/coef we mutliplied the regressor by

df = pd.DataFrame([0.16, 0.27, 0.34], columns=['carat'])
df['price'] = r3.predict(df)
df

diamond['price_predict'] = r3.predict(diamond)

%%R -i diamond
g <- ggplot(diamond, aes(carat, price)) + geom_point(color='green')
g <- g + geom_line(data=diamond, aes(carat, price_predict), colour='blue', linetype=2) 
g <- g + geom_point(data=diamond, aes(carat, price_predict), colour='red', shape=3) 
g + theme_bw()

# green dots - actual observations
# blue dashed line - predictions
# red dashes - places where we made predictions. Note that these are the same x_i values where we have observed points

%%R -o diamond
data(diamond)
y <- diamond$price
x <- diamond$carat
n <- length(y)
fit <- lm(y~x)
e <- resid(fit)
yhat <- predict(fit)
max(abs(e - (y-yhat)))

diamond
x = diamond.carat
y = diamond.price
results = sm.ols("price ~ carat", data=diamond).fit()

e = results.resid

yhat = results.predict()

df = pd.DataFrame([e, y - yhat]).T
df.columns = ['e', 'y - yhat']

df.head()

(df.e - df['y - yhat']).abs().max()

from statsmodels.api import graphics

fig, ax = plt.subplots(figsize=(12, 8))
fig = graphics.plot_fit(results, "carat", ax=ax)

fig, ax = plt.subplots(figsize=(12, 8))
graphics.plot_regress_exog(results, "carat", fig);

# generated data set
from random import uniform, normalvariate, seed
from math import sin

seed(123)
x = [uniform(-3, 3) for _ in xrange(100)]
y = [_ + sin(_) + normalvariate(mu=100, sigma=.2) for _ in x]
df = pd.DataFrame([_ for _ in zip(x, y)], columns = ['x', 'y'])
df = df.sort('x')

plt.scatter(df.y, df.x);

results = sm.ols("y ~ x", data=df).fit()

# why does this show up twice??
# todo: look at the source and figure 
fig, ax = plt.subplots(figsize=(12, 8))
graphics.plot_regress_exog(results, "x", fig);

plt.plot(df.x, results.resid, 'o');

x = [uniform(0, 6) for _ in xrange(100)]
df = pd.DataFrame(x, columns = ['x'])

ygen = lambda x: x + normalvariate(0, .001*x)
df['y'] = [ygen(x) for x in df.x]

df = df.sort('x')
plt.plot(df.x, df.y, 'o');

r_hetero = sm.ols("y~x", data=df).fit()

    plt.plot(df.x, r_hetero.resid, 'o');

fig, ax = plt.subplots(figsize=(12, 8))
graphics.plot_regress_exog(r_hetero, "x", fig);

diamond.head()

%%R
y <- diamond$price; x <- diamond$carat; n <- length(y)
fit <- lm(y ~ x)
summary(fit)$sigma
sqrt(sum(resid(fit)^2) / (n - 2))

y = 'price'; x = 'carat'
fit = sm.ols('%s ~ %s' % (y, x, ), data=diamond).fit()

e = (fit.predict(diamond) - diamond.price)**2

from math import sqrt
sigma = sqrt((e.shape[0]-2)**-1 * e.sum())
sigma

# Normalized by N-1 by default. This can be changed using the ddof argument
sqrt(fit.resid.var(ddof=2))

from __future__ import print_function
"""
Edward Tufte uses this example from Anscombe to show 4 datasets of x
and y that have the same mean, standard deviation, and regression
line, but which are qualitatively different.

matplotlib fun for a rainy day
"""

from pylab import *

x =  array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5])
y1 = array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
y2 = array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
y3 = array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x4 = array([8,8,8,8,8,8,8,19,8,8,8])
y4 = array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89])

def fit(x):
    return 3+0.5*x


xfit = array( [amin(x), amax(x) ] )

subplot(221)
plot(x,y1,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), xticklabels=[], yticks=(4,8,12), xticks=(0,10,20))
text(3,12, 'I', fontsize=20)

subplot(222)
plot(x,y2,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), xticklabels=[], yticks=(4,8,12), yticklabels=[], xticks=(0,10,20))
text(3,12, 'II', fontsize=20)

subplot(223)
plot(x,y3,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
text(3,12, 'III', fontsize=20)
setp(gca(), yticks=(4,8,12), xticks=(0,10,20))

subplot(224)

xfit = array([amin(x4),amax(x4)])
plot(x4,y4,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), yticklabels=[], yticks=(4,8,12), xticks=(0,10,20))
text(3,12, 'IV', fontsize=20)

#verify the stats
pairs = (x,y1), (x,y2), (x,y3), (x4,y4)
for x,y in pairs:
    print ('mean=%1.2f, std=%1.2f, r=%1.2f'%(mean(y), std(y), corrcoef(x,y)[0][1]))

show()

x =  array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5])
y1 = array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
y2 = array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
y3 = array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x4 = array([8,8,8,8,8,8,8,19,8,8,8])
y4 = array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89])

df = pd.DataFrame([x, x4, y1, y2, y3, y4]).T
df.columns = ['x', 'x4', 'y1', 'y2', 'y3', 'y4']

def regress(x, y, data):
    formula = '%s ~ %s' % (y, x, )
    return sm.ols(formula, data=data).fit()

subplot(221)
plt.plot(df.x, regress('x', 'y1', df).resid, 'ro');
plt.title('I');
subplot(222)
plt.plot(df.x, regress('x', 'y2', df).resid, 'ro');
plt.title('II');
subplot(223)
plt.plot(df.x, regress('x', 'y3', df).resid, 'ro');
plt.title('III');
subplot(224)
plt.plot(df.x, regress('x4', 'y4', df).resid, 'ro');
plt.title('IV');

%%R -o diamond
library(UsingR); data(diamond)
y <- diamond$price; x <- diamond$carat; n <- length(y)
beta1 <- cor(y, x) * sd(y) / sd(x)
beta0 <- mean(y) - beta1 * mean(x)
e <- y - beta0 - beta1 * x
sigma <- sqrt(sum(e^2) / (n-2)) 
ssx <- sum((x - mean(x))^2)
seBeta0 <- (1 / n + mean(x) ^ 2 / ssx) ^ .5 * sigma 
seBeta1 <- sigma / sqrt(ssx)
tBeta0 <- beta0 / seBeta0; 
tBeta1 <- beta1 / seBeta1
pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE)
pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE)
coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1))
colnames(coefTable) <- c("Estimate", "Std. Error", "t value", "P(>|t|)")
rownames(coefTable) <- c("(Intercept)", "x")
coefTable

import scipy.stats as st
n = 46
st.t.pdf(abs(1), df = n - 2)

x = diamond['carat']
y = diamond['price']
n = float(len(y))
beta1 = y.corr(x) * y.std() / x.std() # regression slope
beta0 = y.mean() - beta1 * x.mean()   # regression intercept
e = y - (beta1 * x + beta0) # errors
sigma = ((e**2).sum()/(n - 2))**0.5 # unbiased residual mean 
ssx = ((x - x.mean())**2).sum() # sum of squared Xs
seBeta0 = (1/n + x.mean()**2 / ssx)**0.5 * sigma
seBeta1 = sigma / ssx**0.5
tBeta0 = beta0 / seBeta0
tBeta1 = beta1 / seBeta1
# TODO: review hypothesis testing
pBeta0 = 2 * (1-st.t.cdf(abs(tBeta0), df = n - 2))
pBeta1 = 2 * (1-st.t.cdf(abs(tBeta1), df = n - 2))
l = [[beta0, seBeta0, tBeta0, pBeta0], 
     [beta1, seBeta1, tBeta1, pBeta1]]
coefTable = pd.DataFrame((_ for _ in l), 
                         columns = ["Estimate", "Std. Error", "t value", "P(>|t|)"],
                         index = ["(Intercept)", "x"])

coefTable

fit = sm.ols("price ~ carat", data=diamond).fit()
fit.summary()

%%R
data(diamond)
fit <- lm("price ~ carat", data=diamond)
summary(fit)

print ('\n'.join(st.t.pdf.__doc__.split('\n')[:100]))

import numpy as np
x = np.arange(-3, 3, .01)
ypdf = [st.t.pdf(_, n - 2) for _ in x]

ycdf = [1 - st.t.cdf(_, n - 2) for _ in x]

f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(x, ypdf)
axarr[0].set_title('PDF')
axarr[1].plot(x, ycdf)
axarr[1].set_title('CDF');

%%R -o dfR
# test to figure out wtf the pt function is doing
require(ggplot2)
n <- 48
x <- seq(-3, 3, by=.01)
y <- pt(x, df = n - 2, lower.tail = FALSE)
df1 <- data.frame(x, y, 'false')
names(df1) <- c('x', 'y', 'tail')

y <- pt(x, df = n - 2, lower.tail = TRUE)
df2 <- data.frame(x, y, 'true')
names(df2) <- c('x', 'y', 'tail')
dfR <- rbind(df1, df2)
ggplot(dfR, aes(x, y, colour=tail)) + geom_line() + theme_bw() + ggtitle("Effect of lower.tail parameter in `pt` function")

dfR = dfR[dfR['tail'] == 'false']

f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(dfR['x'], dfR['y'])
axarr[0].set_title('R CDF')
axarr[1].plot(x, ycdf)
axarr[1].set_title('Python CDF');

%%R
sumCoef <- summary(fit)$coefficients
sumCoef

%%R
sumCoef <- summary(fit)$coefficients
sumCoef[1,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2]

%%R
sumCoef[2,1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2]

coefTable['Estimate']['x'] + pd.Series([-1, 1]) * coefTable['Std. Error']['x'] * st.t.ppf(.975, fit.df_resid)

coefTable['Estimate']['(Intercept)'] + pd.Series([-1, 1]) * coefTable['Std. Error']['(Intercept)'] * st.t.ppf(.975, fit.df_resid)

%%R
require(reshape)
require(ggplot2)
n <- 3
by <- 0.001
x <- seq(-n, n, by)
qt <- qt(x, df = 30)
pt <- pt(x, df=30)
dt <- dt(x, df=30)
rt <- rt(x, df=30)

df <- data.frame(x, qt, pt, dt, rt)
names(df) <- c('x', 'qt\ndensity', 'pt\ndistribution', 'dt\nquantile', 'rt\nrandom')
mdf <- melt.data.frame(df, id = c('x'))
g <- ggplot(mdf, aes(x, value, colour=variable)) + geom_point(alpha=0.1)
g <- g + facet_wrap(~variable, scale='free_y') + theme_bw()
g <- g + ggtitle("R Functions for T-Distribution") + theme(legend.position="none")
g <- g + ylab('f(x)')
g

x = np.arange(-3, 3, .001)
dt = st.t.pdf(x, 30)
qt = st.t.ppf(x, 30)
pt = st.t.cdf(x, 30)
f, axarr = plt.subplots(3, sharex=True)
axarr[0].plot(x, dt)
axarr[0].set_title('Python st.t.pdf = R dt')
axarr[1].plot(x, qt)
axarr[1].set_title('Python st.t.ppf = R qt')
axarr[2].plot(x, pt)
axarr[2].set_title('Python st.t.cdf = R pt');

st.t.ppf(.975, 46)

st.norm.ppf(.975)

%%R
qt(.975, df = fit$df)

%%R
data(diamond)
x <- diamond$carat; y <- diamond$price
plot(x, y, frame=FALSE,xlab="Carat",ylab="Dollars",pch=21,col="black", bg="lightblue", cex=2)
abline(fit, lwd = 2)
xVals <- seq(min(x), max(x), by = .01)
yVals <- beta0 + beta1 * xVals
se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx)      # orange
se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx)  # blue
lines(xVals, yVals + 2 * se1, col='orange')
lines(xVals, yVals - 2 * se1, col='orange')
lines(xVals, yVals + 2 * se2, col='blue')
lines(xVals, yVals - 2 * se2, col='blue')

x = [0.61, 0.93, 0.83, 0.35, 0.54, 0.16, 0.91, 0.62, 0.62]
y = [0.67, 0.84, 0.6, 0.18, 0.85, 0.47, 1.1, 0.65, 0.36]
df = pd.DataFrame(zip(x, y), columns = ['x', 'y'])

from math import sqrt
x = df['x']
y = df['y']
n = len(x)
beta1 = y.corr(x) * (y.std() / x.std())
beta0 = y.mean() - beta1 * x.mean()
ssx = ((x - mean(x))**2).sum()
yhat = beta0 + beta1 * x
e = y - yhat
sigma = sqrt(1.0/(n-2) * (e**2).sum())
se_beta1 = sqrt((sigma**2 / ssx))
p = beta1 / se_beta1
p

results = sm.ols('y~x', data=df).fit().summary()
results

sigma

%%R -o mtcars
data(mtcars)

r = sm.ols('mpg ~ I(wt - wt.mean())', data=mtcars).fit()

alpha = 0.05
20.0906 + pd.Series([-1, 1]) * st.t.ppf(1 - alpha/2, 30) * .538

t = r.summary().tables[1]
print (t.as_text())

(20.0906 - 18.991) / .538

%%R
data(diamond)
x <- diamond$carat; y <- diamond$price
xVals <- seq(min(x), max(x), by = .01)
yVals <- beta0 + beta1 * xVals
se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx)      # orange
se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx)  # blue

x = mtcars['wt']
y = mtcars['mpg']

n = len(x)
ssx = (1.0/n) * ((x- x.mean())**2).sum()
e = (r.resid - x)
sigma = sqrt(1.0/(n-2) * (e**2).sum())
xVals = 3
se = sigma * sqrt(1 + 1 / n + (xVals - mean(x))**2/ssx)
se

(3 - mtcars.wt.mean()) * r.params[1] + r.params[0] + pd.Series([-1, 1]) * se * st.norm.ppf(.975)

sm.ols('mpg ~ I(wt/2)', data=mtcars).fit().summary()

r.resid.sum()

%%R
library(datasets); data(swiss); require(stats); require(graphics)
pairs(swiss, panel = panel.smooth, main = "Swiss data", col = 3 + (swiss$Catholic > 50))

%%R -o swiss -w 960 -h 960
require(GGally)
library(datasets); 
data(swiss); 
theme_set(theme_bw())
swiss$cath_majority = factor(swiss$Catholic >= 50)
ggpairs(swiss, 
        lower = list(continuous = "smooth"),
        colour = "cath_majority"
)

_ = pd.scatter_matrix(swiss, figsize=(12,8), diagonal='kde');

try:
    del swiss['cath_majority']
except KeyError:
    pass
swiss.columns = [_.replace('.', '_') for _ in swiss.columns]
y = 'Fertility'
x = "+".join(swiss.columns - [y])
formula = '%s ~ %s' % (y, x)

results1 = sm.ols(formula, data=swiss).fit()
results1.summary()

x = 'Agriculture'
y = 'Fertility'
results = sm.ols('%s ~ %s' % (y, x, ), data=swiss).fit()
results.summary()

%%R -w 300 -h 300 -o df_simul
require(ggplot2)
require(GGally)
n <- 100; 
x2 <- 1 : n; 
x1 <- .01 * x2 + runif(n, -.1, .1); 
y = -x1 + x2 + rnorm(n, sd = .01)
df_simul <- data.frame(x1, x2, y)
theme_set(theme_bw())
(ggpairs(df_simul))

r1 = sm.ols('y ~ x1', data=df_simul).fit()
r2 = sm.ols('y ~ x1 + x2', data=df_simul).fit()

r1.params, r2.params

graphics.plot_regress_exog(r1, 'x1');

graphics.plot_regress_exog(r2, "x1");

x1 = df_simul['x1']
x2 = df_simul['x2']
y = df_simul['y']

f, axarr = plt.subplots(2, sharex=True)
axarr[0].plot(x1, y)
axarr[0].set_title('x1')
axarr[1].plot(x2, y)
axarr[1].set_title('x2');

r1 = sm.ols('x1 ~ x2', data=df_simul).fit()
r2 = sm.ols('y ~ x2', data=df_simul).fit()

# Here we factor out the effect of x2
# Not really sure what the point of is this
plt.plot(r1.resid, r2.resid, 'o');

%%R -o swiss
data(swiss)

swiss.columns = [_.replace('.', '_') for _ in swiss.columns]
swiss['z'] = swiss['Agriculture'] + swiss['Education']
y = 'Fertility'
x = "+".join(swiss.columns - [y])
formula = '%s ~ %s' % (y, x)
swiss.head()
x


r = sm.ols('%s ~ %s' % (y, x), data=swiss).fit();

# wait, this doesn't work. sm.ols doesn't pick up that Z is no good
r.summary()

%%R
# R does pick up that Z is no good. Note that the agriculture value is different
# Post question about this
data(swiss)
z<-swiss$Agriculture+swiss$Education
lm(Fertility~.+z,data=swiss)

%%R -o InsectSprays
require(ggplot)
g <- ggplot(InsectSprays, aes(spray, count)) + geom_boxplot()
#summary(lm(count ~ spray, data = InsectSprays))$coef
print(g)
summary(lm(count ~ spray, data = InsectSprays))$coef

# InsectSprays['spray'].astype(str)
results = sm.ols("count ~ spray", data = InsectSprays).fit()
results.summary()

### Hard coding the dummy variables

%%R
summary(lm(count~ I(1*(spray=='B'))+I(1*(spray=='C'))+
I(1*(spray=='D'))+I(1*(spray=='E'))+
I(1*(spray=='F')) ,data=InsectSprays))$coef

formula = "count~ I(1*(spray=='B'))+I(1*(spray=='C'))+I(1*(spray=='D'))+I(1*(spray=='E'))+I(1*(spray=='F'))"

formula

sm.ols(formula, data=InsectSprays).fit().summary()

%%R
lm(count~
I(1*(spray=='B'))+I(1*(spray=='C'))+ I(1*(spray=='D'))+I(1*(spray=='E'))+ I(1*(spray=='F'))+I(1*(spray=='A')),data=InsectSprays)

formula="count~I(1*(spray=='B'))+I(1*(spray=='C'))+ I(1*(spray=='D'))+I(1*(spray=='E'))+ I(1*(spray=='F'))+I(1*(spray=='A'))"
sm.ols(formula, data=InsectSprays).fit().summary()

formula = "count~ I(1*(spray=='B'))+I(1*(spray=='C'))+I(1*(spray=='D'))+I(1*(spray=='E'))+I(1*(spray=='F')) -1"

sm.ols(formula, data=InsectSprays).fit().summary()

InsectSprays.head()

IS = InsectSprays
grouped = IS.groupby("spray")

grouped.mean()

%%R -o hunger
hunger <- read.csv('data/hunger.csv')
hunger <- hunger[hunger$Sex != "Both sexes", ]
lm1 <- lm(hunger$Numeric ~ hunger$Year)
qqnorm(residuals(lm1))
qqline(residuals(lm1))

hunger.describe()

# columns that need to be fixed from the R import of data
cols = ['Low', 'High', 'Comments']

for col in cols:
    hunger[col] = hunger[col].replace(-2147483648 , NaN)
hunger.head()

y = 'Numeric'
x = 'Year'
formula = '%s ~ %s' %(y, x,)
fit = sm.ols(formula, data=hunger).fit() 

fit.summary()

fig, ax = plt.subplots(figsize=(12, 8))
graphics.plot_regress_exog(fit, "Year", fig);

%%R
require(ggplot2)
g <- ggplot(hunger, aes(Year, Numeric, colour=Sex)) + geom_point(alpha=0.5)
g <- g + geom_smooth(method='lm', se=T)
g

%%R
lmM <- lm(hunger$Numeric[hunger$Sex=="Male"] ~ hunger$Year[hunger$Sex=="Male"])
lmF <- lm(hunger$Numeric[hunger$Sex=="Female"] ~ hunger$Year[hunger$Sex=="Female"])
print(summary(lmM))
plot(hunger$Year,hunger$Numeric,pch=19)
points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1))
lines(hunger$Year[hunger$Sex=="Male"],lmM$fitted,col="black",lwd=3)
lines(hunger$Year[hunger$Sex=="Female"],lmF$fitted,col="red",lwd=3)

%%R
lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex)
print(summary(lmBoth))
plot(hunger$Year,hunger$Numeric,pch=19)
points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1))
abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3)
abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] ),col="black",lwd=3)

%%R
lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex*hunger$Year)
print(summary(lmBoth))
plot(hunger$Year,hunger$Numeric,pch=19)
points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1))
abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3)
abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] +lmBoth$coeff[4]),col="black",lwd=3)

%%R
n <- 100;
t <- rep(c(0, 1), c(n/2, n/2)); # treatment effect
x <- c(runif(n/2), runif(n/2)); # 

beta0 <- 0; #intercept
beta1 <- 2; #slope
tau <- 1;   #treatment effect
sigma <- .2 # variance

y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma) #simulate reg
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2) # disregard treatment
abline(h = mean(y[1 : (n/2)]), lwd = 3) #mean untreated group
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3) #  mean treated gropu
fit <- lm(y ~ x + t) # correct model with treatment effect

# The slope is simply x for both lines. When considering t, then the intersect is \beta0 + t

abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)

%%R

n <- 100; t <- rep(c(0, 1), c(n/2, n/2)); x <- c(runif(n/2), 1.5 + runif(n/2));
beta0 <- 0; beta1 <- 2; tau <- 0; sigma <- .2
y <- beta0 + x * beta1 + t * tau + rnorm(n, sd = sigma)
plot(x, y, type = "n", frame = FALSE)
abline(lm(y ~ x), lwd = 2)
abline(h = mean(y[1 : (n/2)]), lwd = 3)
abline(h = mean(y[(n/2 + 1) : n]), lwd = 3)
fit <- lm(y ~ x + t)
abline(coef(fit)[1], coef(fit)[2], lwd = 3)
abline(coef(fit)[1] + coef(fit)[3], coef(fit)[2], lwd = 3)
points(x[1 : (n/2)], y[1 : (n/2)], pch = 21, col = "black", bg = "lightblue", cex = 2)
points(x[(n/2 + 1) : n], y[(n/2 + 1) : n], pch = 21, col = "black", bg = "salmon", cex = 2)

from datetime import timedelta
YouTubeVideo('BhGQ4YCC8xA', start=int(timedelta(minutes=5, seconds=40).total_seconds()))