from __future__ import division

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import export_graphviz
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from IPython.display import Image
import StringIO, pydot

%matplotlib inline

carseats_df = pd.read_csv("../data/Carseats.csv")
carseats_df.head()

# Convert the string variables to numeric codes
carseats_df["ShelveLoc"] = pd.factorize(carseats_df["ShelveLoc"])[0]
carseats_df["Urban"] = pd.factorize(carseats_df["Urban"])[0]
carseats_df["US"] = pd.factorize(carseats_df["US"])[0]

# We want to add a binary response variable High (high sales). We build a
# histogram of Sales to determine the cut point.
plt.hist(carseats_df["Sales"])

# Create the binary response variable High and add it to the dataframe
carseats_df["High"] = carseats_df["Sales"].map(lambda x: 0 if x <= 8 else 1)
carseats_df.head()

# Fit a decision tree to predict High. We remove Sales because High is
# derived from it; keeping Sales would leak the response into the predictors.
collist = [x for x in carseats_df.columns if x not in ('Sales', 'High')]
clf = DecisionTreeClassifier()
X = carseats_df[collist].values
y = carseats_df["High"].values
clf.fit(X, y)

# Visualize the tree
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# Hold out a test set so we can estimate the tree's generalization error
Xr, Xt, yr, yt = train_test_split(X, y, train_size=250, test_size=150,
                                  random_state=42)
clf = DecisionTreeClassifier()
clf.fit(Xr, yr)
ypred = clf.predict(Xt)
(confusion_matrix(yt, ypred), accuracy_score(yt, ypred))

# Choose max_depth by 10-fold cross-validation on the training set
kfold = KFold(Xr.shape[0], n_folds=10)
accs = []
max_depths = range(1, 20)
for max_depth in max_depths:
    k_accs = []
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = Xr[train], Xr[test], yr[train], yr[test]
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtest)
        k_accs.append(accuracy_score(ytest, ypred))
    accs.append(np.mean(k_accs))

# Plot the mean CV accuracies as a function of max_depth
plt.plot(max_depths, accs, linewidth=2.5)

# Refit at the chosen depth and evaluate on the held-out test set
clf = DecisionTreeClassifier(max_depth=7)
clf.fit(Xr, yr)
ypred = clf.predict(Xt)
confusion_matrix(yt, ypred), accuracy_score(yt, ypred)

boston_df = pd.read_csv("../data/Boston.csv")
boston_df.head()

X = boston_df[boston_df.columns[:-1]].values
y = boston_df["medv"].values

# Split into training and test sets
Xr, Xt, yr, yt = train_test_split(X, y, train_size=300, test_size=202,
                                  random_state=42)

# Train a random forest regressor; note that oob_score_ is the R^2 of the
# out-of-bag predictions, not an error
reg = RandomForestRegressor(n_estimators=500, oob_score=True)
reg.fit(Xr, yr)
ypred = reg.predict(Xt)
mean_squared_error(yt, ypred), reg.oob_score_

# Vary the number of features considered at each split and record the
# test MSE and the OOB R^2 for each setting
oob_scores = []
mses = []
num_feats = range(1, 14)
for num_feat in num_feats:
    reg = RandomForestRegressor(n_estimators=500, max_features=num_feat,
                                oob_score=True)
    reg.fit(Xr, yr)
    ypred = reg.predict(Xt)
    mses.append(mean_squared_error(yt, ypred))
    oob_scores.append(reg.oob_score_)

plt.plot(num_feats, mses, color='b', linewidth=2.5)
plt.plot(num_feats, oob_scores, color='r', linewidth=2.5)
plt.xlabel("features")
plt.ylabel("MSE (blue); OOB R^2 (red)")

# Gradient boosting: test MSE as a function of the number of trees
num_trees = range(100, 2000, 100)
mses = []
for num_tree in num_trees:
    reg = GradientBoostingRegressor(n_estimators=num_tree)
    reg.fit(Xr, yr)
    ypred = reg.predict(Xt)
    mses.append(mean_squared_error(yt, ypred))

plt.plot(num_trees, mses)
plt.xlabel("#-trees")
plt.ylabel("MSE")
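# Back on the Boston forest: it can help to rank the predictors. scikit-learn
# exposes impurity-based importances via the feature_importances_ attribute of
# a fitted ensemble. A minimal sketch, reusing boston_df / Xr / yr from above
# (the re-fit and random_state here are assumptions added for reproducibility):
feature_names = boston_df.columns[:-1]
reg = RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(Xr, yr)
# Sort predictors by importance, largest first
for name, imp in sorted(zip(feature_names, reg.feature_importances_),
                        key=lambda t: t[1], reverse=True):
    print("%10s %.4f" % (name, imp))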
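# Refitting a separate booster for every tree count repeats a lot of work. A
# cheaper sketch of the same curve uses staged_predict, which yields the
# ensemble's test predictions after each boosting stage of a single fit (the
# n_estimators=2000 cap mirrors the range above and is otherwise arbitrary):
reg = GradientBoostingRegressor(n_estimators=2000)
reg.fit(Xr, yr)
stage_mses = [mean_squared_error(yt, ypred)
              for ypred in reg.staged_predict(Xt)]
plt.plot(range(1, 2001), stage_mses)
plt.xlabel("#-trees")
plt.ylabel("MSE")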
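# The imports above target Python 2 and scikit-learn < 0.18. A minimal sketch
# of the same pieces on Python 3 with a current scikit-learn, where
# cross_validation became model_selection, StringIO moved into the io module,
# and KFold yields index pairs via .split() (version cutoffs are from memory;
# verify against your installed versions):
import io
import pydot
from sklearn.model_selection import train_test_split, KFold

# 10-fold CV over the Boston training arrays from above
kfold = KFold(n_splits=10)
k_mses = []
for train, test in kfold.split(Xr):
    reg = RandomForestRegressor(n_estimators=100)
    reg.fit(Xr[train], yr[train])
    k_mses.append(mean_squared_error(yr[test], reg.predict(Xr[test])))
np.mean(k_mses)

# Recent pydot returns a list of graphs, hence the [0]; clf is the Carseats
# tree fitted earlier in the notebook
dot_data = io.StringIO()
export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
Image(graph.create_png())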