from __future__ import division

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import export_graphviz
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from IPython.display import Image
import StringIO, pydot

%matplotlib inline

carseats_df = pd.read_csv("../data/Carseats.csv")
carseats_df.head()

# Convert the string variables to numeric codes
carseats_df["ShelveLoc"] = pd.factorize(carseats_df["ShelveLoc"])[0]
carseats_df["Urban"] = pd.factorize(carseats_df["Urban"])[0]
carseats_df["US"] = pd.factorize(carseats_df["US"])[0]

# We want to add a binary response variable High (high sales). We build a
# histogram of Sales to determine the cut point.
plt.hist(carseats_df["Sales"])

# Create the binary response variable High and add it to the dataframe
carseats_df["High"] = carseats_df["Sales"].map(lambda x: 0 if x <= 8 else 1)
carseats_df.head()

# Fit a decision tree to predict High. We remove Sales because High is
# derived from it; keeping Sales would leak the response into the predictors.
collist = [x for x in carseats_df.columns if x not in ('Sales', 'High')]
clf = DecisionTreeClassifier()
X = carseats_df[collist].values
y = carseats_df["High"].values
clf.fit(X, y)

# Visualize the tree
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# Hold out a test set so we can estimate the tree's generalization error
Xr, Xt, yr, yt = train_test_split(X, y, train_size=250, test_size=150,
                                  random_state=42)
clf = DecisionTreeClassifier()
clf.fit(Xr, yr)
ypred = clf.predict(Xt)
(confusion_matrix(yt, ypred), accuracy_score(yt, ypred))

# Choose max_depth by 10-fold cross-validation on the training set
kfold = KFold(Xr.shape[0], n_folds=10)
accs = []
max_depths = range(1, 20)
for max_depth in max_depths:
    k_accs = []
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = Xr[train], Xr[test], yr[train], yr[test]
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf.fit(Xtrain, ytrain)
        ypred = clf.predict(Xtest)
        k_accs.append(accuracy_score(ytest, ypred))
    accs.append(np.mean(k_accs))

# Plot the mean CV accuracies as a function of max_depth
plt.plot(max_depths, accs, linewidth=2.5)

# Refit at the chosen depth and evaluate on the held-out test set
clf = DecisionTreeClassifier(max_depth=7)
clf.fit(Xr, yr)
ypred = clf.predict(Xt)
confusion_matrix(yt, ypred), accuracy_score(yt, ypred)

boston_df = pd.read_csv("../data/Boston.csv")
boston_df.head()

X = boston_df[boston_df.columns[:-1]].values
y = boston_df["medv"].values

# Split into training and test sets
Xr, Xt, yr, yt = train_test_split(X, y, train_size=300, test_size=202,
                                  random_state=42)

# Train a random forest regressor; note that oob_score_ is the R^2 of the
# out-of-bag predictions, not an error
reg = RandomForestRegressor(n_estimators=500, oob_score=True)
reg.fit(Xr, yr)
ypred = reg.predict(Xt)
mean_squared_error(yt, ypred), reg.oob_score_

# Vary the number of features considered at each split and record the
# test MSE and the OOB R^2 for each setting
oob_scores = []
mses = []
num_feats = range(1, 14)
for num_feat in num_feats:
    reg = RandomForestRegressor(n_estimators=500, max_features=num_feat,
                                oob_score=True)
    reg.fit(Xr, yr)
    ypred = reg.predict(Xt)
    mses.append(mean_squared_error(yt, ypred))
    oob_scores.append(reg.oob_score_)

plt.plot(num_feats, mses, color='b', linewidth=2.5)
plt.plot(num_feats, oob_scores, color='r', linewidth=2.5)
plt.xlabel("features")
plt.ylabel("MSE (blue); OOB R^2 (red)")

# Gradient boosting: test MSE as a function of the number of trees
num_trees = range(100, 2000, 100)
mses = []
for num_tree in num_trees:
    reg = GradientBoostingRegressor(n_estimators=num_tree)
    reg.fit(Xr, yr)
    ypred = reg.predict(Xt)
    mses.append(mean_squared_error(yt, ypred))

plt.plot(num_trees, mses)
plt.xlabel("#-trees")
plt.ylabel("MSE")
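# Back on the Boston forest: it can help to rank the predictors. scikit-learn
# exposes impurity-based importances via the feature_importances_ attribute of
# a fitted ensemble. A minimal sketch, reusing boston_df / Xr / yr from above
# (the re-fit and random_state here are assumptions added for reproducibility):
feature_names = boston_df.columns[:-1]
reg = RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(Xr, yr)
# Sort predictors by importance, largest first
for name, imp in sorted(zip(feature_names, reg.feature_importances_),
                        key=lambda t: t[1], reverse=True):
    print("%10s %.4f" % (name, imp))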
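# Refitting a separate booster for every tree count repeats a lot of work. A
# cheaper sketch of the same curve uses staged_predict, which yields the
# ensemble's test predictions after each boosting stage of a single fit (the
# n_estimators=2000 cap mirrors the range above and is otherwise arbitrary):
reg = GradientBoostingRegressor(n_estimators=2000)
reg.fit(Xr, yr)
stage_mses = [mean_squared_error(yt, ypred)
              for ypred in reg.staged_predict(Xt)]
plt.plot(range(1, 2001), stage_mses)
plt.xlabel("#-trees")
plt.ylabel("MSE")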
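# The imports above target Python 2 and scikit-learn < 0.18. A minimal sketch
# of the same pieces on Python 3 with a current scikit-learn, where
# cross_validation became model_selection, StringIO moved into the io module,
# and KFold yields index pairs via .split() (version cutoffs are from memory;
# verify against your installed versions):
import io
import pydot
from sklearn.model_selection import train_test_split, KFold

# 10-fold CV over the Boston training arrays from above
kfold = KFold(n_splits=10)
k_mses = []
for train, test in kfold.split(Xr):
    reg = RandomForestRegressor(n_estimators=100)
    reg.fit(Xr[train], yr[train])
    k_mses.append(mean_squared_error(yr[test], reg.predict(Xr[test])))
np.mean(k_mses)

# Recent pydot returns a list of graphs, hence the [0]; clf is the Carseats
# tree fitted earlier in the notebook
dot_data = io.StringIO()
export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
Image(graph.create_png())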