#!/usr/bin/env python
# coding: utf-8

# Answer all questions and submit them either as an IPython notebook, LaTeX document, or Markdown document. Provide full answers for each question, including interpretation of the results. Each question is worth 25 points.
#
# This homework is due on Monday, December 8, 2016.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
# import pymc as pm
import pylab as plt
import pandas as pd

# Set seed
np.random.seed(42)

# ## Question 1
#
# The `titanic.xls` spreadsheet in the `data` directory contains data regarding the passengers on the Titanic when it sank in 1912. A recent [Kaggle competition](http://www.kaggle.com/c/titanic-gettingStarted) was based on predicting survival for passengers based on the attributes in the passenger list.
#
# Use scikit-learn to build both a support vector classifier and a logistic regression model to predict survival on the Titanic. Use cross-validation to assess your models, and try to tune them to improve performance.
#
# Discuss the benefits and drawbacks of both approaches for application to such problems.

# In[2]:

titanic = pd.read_excel('/Users/fonnescj/Teaching/Bios8366/data/titanic.xls', 'titanic')
titanic.head()

# In[3]:

titanic_data = titanic.copy()[['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
titanic_data['female'] = titanic_data.sex.replace(to_replace=['male', 'female'], value=[0, 1], regex=False)
titanic_data = titanic_data.drop('sex', axis=1)

# In[4]:

titanic_data.embarked.value_counts()

# In[5]:

titanic_data.embarked = titanic_data.embarked.replace({'S': 0, 'C': 1, 'Q': 2})

# In[6]:

titanic_data = titanic_data.dropna(subset=['embarked'])

# In[7]:

from sklearn import preprocessing

lb = preprocessing.LabelBinarizer()
lb.fit(list(titanic_data.embarked.dropna()))
embarked_classes = lb.transform(titanic_data.embarked.values)
titanic_data['C'] = embarked_classes.T[1]
titanic_data['Q'] = embarked_classes.T[2]
titanic_data = titanic_data.drop('embarked', axis=1)

# In[8]:

X = titanic_data.dropna()
y = X.pop('survived')

# In[15]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import normalize

X_normalized = normalize(X.values.astype(float))
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y)

# In[16]:

from sklearn import svm

svc1 = svm.SVC(kernel='linear')
svc1.fit(X_train, y_train)
svc1.score(X_train, y_train)

# In[17]:

svc1.score(X_test, y_test)

# In[18]:

parameters = {'kernel': ('linear', 'rbf', 'poly'), 'C': [0.1, 0.5, 1, 5, 10]}
np.random.seed(3535)
gscv1 = GridSearchCV(svm.SVC(), parameters, cv=5)  # default is 3-fold
gscv1.fit(X_normalized, y)

# In[19]:

gscv1.best_estimator_

# In[20]:

gscv1.best_score_

# In[21]:

gscv1.score(X_test, y_test)

# In[22]:

from sklearn.linear_model import LogisticRegression

lrmod = LogisticRegression(C=10000)  # large C means a weak penalty
lrmod.fit(X_train, y_train)

# In[23]:

lrmod.score(X_train, y_train)

# In[24]:

lrmod.score(X_test, y_test)

# In[26]:

# Increase intercept_scaling to minimize the penalty on the intercept
parameters2 = {'penalty': ('l1', 'l2'),
               'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
               'intercept_scaling': [1, 10, 50, 100, 500, 1000]}
np.random.seed(3535)
gscv2 = GridSearchCV(LogisticRegression(), parameters2, cv=5)  # default is 3-fold
gscv2.fit(X_normalized, y)

# In[27]:

gscv2.best_estimator_

# In[28]:

gscv2.best_score_

# In[29]:

gscv2.score(X_test, y_test)

# Logistic regression performed slightly better, and is more easily interpretable, as well as being less computationally intensive.
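# As a quick illustration of the interpretability point, the sketch below (assuming the fitted
# `gscv2` object and the feature DataFrame `X` from the cells above are still in scope) pairs each
# coefficient of the tuned logistic regression with its feature name; positive values raise the
# predicted log-odds of survival. No comparable summary falls directly out of the SVC.

# In[ ]:

best_lr = gscv2.best_estimator_
# Coefficients are on the scale of the normalized features used for fitting
coef_table = pd.Series(best_lr.coef_.ravel(), index=X.columns)
coef_table.sort_values()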
# ## Question 2
#
# The file `TNNASHVI.txt` in your data directory contains daily temperature readings for Nashville, courtesy of the [Average Daily Temperature Archive](http://academic.udayton.edu/kissock/http/Weather/). This data, as one would expect, oscillates annually. Using either GPy or PyMC, use a Gaussian process to fit a non-parametric regression model to this data, choosing an appropriate covariance function. Plot 10 regression lines drawn from your process.

# In[92]:

get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd

daily_temps = pd.read_table("/Users/fonnescj/Teaching/Bios8366/data/TNNASHVI.txt", sep=r'\s+',
                            names=['month', 'day', 'year', 'temp'], na_values=-99)

# In[93]:

daily_temps.head()

# In[94]:

y = daily_temps.temp.dropna().values[:1000]
N = len(y)
x = np.arange(N)/1000

# In[95]:

plt.plot(x, y, 'b.')

# In[120]:

import GPy

# In[122]:

kernel = GPy.kern.PeriodicMatern32(1, variance=10, lengthscale=1.2)

# In[124]:

X = x.reshape(-1, 1)
Y = y.reshape(-1, 1)

# In[123]:

K_xx = kernel.K(X, X)
plt.imshow(np.asmatrix(K_xx), cmap='terrain', interpolation='nearest')
plt.axis('off');

# In[173]:

m = GPy.models.GPRegression(X=X, Y=Y, kernel=kernel)
m.optimize()

# In[174]:

m.plot()
plt.xlim(0, 1)
plt.ylim(0, 120);

# In[184]:

draws = m.posterior_samples(X, full_cov=True, size=10)

# In[190]:

y_sim, MSE = m.predict(X)

# In[191]:

plt.plot(x, draws, color='SeaGreen', alpha=0.1)
plt.plot(x, y_sim - 2*MSE**0.5, 'SeaGreen')
plt.plot(x, y_sim + 2*MSE**0.5, 'SeaGreen')
plt.xlim(0, 1)
plt.ylim(0, 120);

# ## Question 3
#
# The data in `prostate.data.txt` come from a study by Stamey et al. (1989), which examined the correlation between the level of prostate-specific antigen (`lpsa`) and a number of clinical measures in men who were about to receive a radical prostatectomy. The variables are log cancer volume (`lcavol`), log prostate weight (`lweight`), age, log of the amount of benign prostatic hyperplasia (`lbph`), seminal vesicle invasion (`svi`), log of capsular penetration (`lcp`), Gleason score (`gleason`), and percent of Gleason scores 4 or 5 (`pgg45`).
#
# 1. Select (your choice) five competing 3-variable linear regression models, and compare them using AIC, five-fold and ten-fold cross-validation. Discuss the results.
#
# 2. An alternative method for model assessment is to fit the models on a set of bootstrap samples, and then keep track of how well each predicts the original training set. If $\hat{f}^b(x_i)$ is the predicted value at $x_i$ from the model fitted to the $b$th bootstrap dataset, such an estimate is:
#
# $$\frac{1}{B} \frac{1}{N} \sum_{b=1}^B \sum_{i=1}^N L(y_i, \hat{f}^b(x_i))$$
#
# However, because each bootstrap sample shares many observations with the original training set, this estimate will tend to underestimate the true error rate. The so-called .632 estimator alleviates this bias by returning a weighted average of the training error (average loss over the training sample) and the leave-one-out (LOO) bootstrap error $\hat{err}^{(1)}$, which for each observation averages the loss over only those bootstrap fits that did not include it:
#
# $$\hat{err}^{(.632)} = 0.368 \, \bar{err} + 0.632 \, \hat{err}^{(1)}$$
#
# where:
#
# $$\bar{err} = \frac{1}{N}\sum_{i=1}^N L(y_i, \hat{f}(x_i))$$
#
# Repeat the assessment from part (1) using the .632 estimator, and compare the result to the other approaches.
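# As a quick reference for Part 2, the sketch below simply encodes the weighted average from the
# formula above as a helper function; the `err_bar` and `err_1` arguments are placeholders for the
# training error and the LOO bootstrap error computed further down, which combine the two
# components inline in the same way.

# In[ ]:

def err_632(err_bar, err_1):
    """0.632 bootstrap estimator: weighted average of the training error (err_bar)
    and the leave-one-out bootstrap error (err_1)."""
    return 0.368 * err_bar + 0.632 * err_1

# For example, err_632(0.50, 0.65) returns 0.368*0.50 + 0.632*0.65 = 0.5948.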
# In[46]:

# Part 1
prostate = pd.read_table('/Users/fonnescj/Teaching/Bios8366/data/prostate.data.txt', index_col=0)
prostate.head()

# In[47]:

prostate.describe()

# In[48]:

pd.scatter_matrix(prostate, alpha=0.8, figsize=(12, 12), diagonal="kde", grid=False);

# In[86]:

aic = lambda rss, n, k: n*np.log(float(rss)/n) + 2*k

# In[87]:

from sklearn import linear_model

subsets = [['lcavol', 'lweight', 'age'],
           ['lweight', 'age', 'lbph'],
           ['lbph', 'svi', 'lcp'],
           ['age', 'gleason', 'pgg45'],
           ['lcavol', 'svi', 'pgg45']]

y = prostate.lpsa

# In[88]:

aic_values = []
for i, xvars in enumerate(subsets):
    X = prostate[xvars]
    regmod = linear_model.LinearRegression()
    regmod.fit(X, y)
    rss = ((regmod.predict(X) - y) ** 2).sum()
    aic_values.append(aic(rss, len(y), 3))

# In[89]:

aic_values

# In[90]:

# Akaike weights
v = np.exp(-0.5*np.array(aic_values))
aic_weights = v / v.sum()
aic_weights.round(2)

# In[91]:

from sklearn.model_selection import KFold, cross_val_score

five_fold = KFold(n_splits=5)
ten_fold = KFold(n_splits=10)

# In[92]:

five_fold_scores = []
for i, xvars in enumerate(subsets):
    X = prostate[xvars]
    regmod = linear_model.LinearRegression()
    scores = cross_val_score(regmod, X, y, scoring='neg_mean_squared_error', cv=five_fold)
    five_fold_scores.append(scores.mean())

# Note that scikit-learn's MSE is negative.

# In[93]:

five_fold_scores

# In[94]:

ten_fold_scores = []
for i, xvars in enumerate(subsets):
    X = prostate[xvars]
    regmod = linear_model.LinearRegression()
    scores = cross_val_score(regmod, X, y, scoring='neg_mean_squared_error', cv=ten_fold)
    ten_fold_scores.append(scores.mean())

# In[95]:

ten_fold_scores

# The results for each are similar; AIC is asymptotically equivalent to leave-one-out cross-validation, so close agreement is expected. AIC values are also more easily interpretable, since they can be converted into Akaike weights (approximate model probabilities).

# In[96]:

# Scratch check: boolean mask of the observations left out of a single bootstrap sample
X.index.isin(np.random.choice(X.index, len(X))) ^ True

# In[113]:

# Part 2
B = 50

scores_632 = []
for i, xvars in enumerate(subsets):
    X = prostate[xvars].values
    y_ = y.values
    # Calculate bootstrap sample indices
    bootstrap_ind = [np.random.choice(range(len(X)), len(X)) for _ in range(B)]
    # Fit a separate model to each bootstrap sample
    bootstrap_fits = [linear_model.LinearRegression().fit(X[b_ind], y_[b_ind])
                      for b_ind in bootstrap_ind]
    # Loss for each observation, averaged over the models that did not use it for fitting
    loss = [np.mean([(bootstrap_fits[b].predict(X[i].reshape(1, -1)) - y_[i])**2
                     for b in range(B) if i not in bootstrap_ind[b]])
            for i in range(len(X))]
    # LOO bootstrap error
    err_1 = np.mean(loss)
    # Naive (training) error
    regmod = linear_model.LinearRegression().fit(X, y_)
    err_bar = ((regmod.predict(X) - y_)**2).mean()
    # .632 estimator
    scores_632.append(0.368*err_bar + 0.632*err_1)

# In[114]:

scores_632

# Note that the above uses MSE and not negative MSE, so lower values are better here. The results are comparable among the models.

# In[115]:

plt.plot(five_fold_scores)
plt.plot(ten_fold_scores)
plt.plot(-np.array(scores_632))

# ## Question 4
#
# Fit a series of random-forest classifiers to the very low birthweight infant data (`vlbw.csv`), to explore the sensitivity to the parameter `m`, the number of variables considered for splitting at each step. Plot both the out-of-bag error as well as the test error against a suitably-chosen range of values for `m`.
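# For reference, the out-of-bag error can be read directly from a fitted forest when
# `oob_score=True`. The sketch below is a standalone illustration on synthetic data (the
# `make_classification` dataset is purely hypothetical and independent of the `vlbw` preprocessing
# that follows); `oob_score_` is an accuracy, so the OOB error is one minus it.

# In[ ]:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
rf_demo = RandomForestClassifier(n_estimators=100, max_features=5, oob_score=True, random_state=0)
rf_demo.fit(X_demo, y_demo)
1 - rf_demo.oob_score_  # out-of-bag error estimate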
# In[116]:

vlbw = pd.read_csv('/Users/fonnescj/Teaching/Bios8366/data/vlbw.csv', index_col=0)
vlbw.loc[1]

# In[117]:

vlbw.ipe.unique()

# In[118]:

vlbw_numeric = vlbw.replace({'sex': {'female': 0, 'male': 1},
                             'delivery': {'abdominal': 0, 'vaginal': 1},
                             'inout': {'born at Duke': 0, 'transported': 1},
                             'race': {'white': 0, 'black': 1, 'native American': 2, 'oriental': 3},
                             'ivh': {'definite': 1, 'absent': 0, 'possible': 1},
                             'ipe': {'definite': 1, 'absent': 0, 'possible': 1},
                             'pvh': {'definite': 1, 'absent': 0, 'possible': 1}})

# In[119]:

vlbw_numeric.isnull().mean(0).round(2)

# We will drop variables that have a lot of missing values, as well as the response variables.

# In[120]:

vlbw_numeric = vlbw_numeric.drop(['lol', 'magsulf'], axis=1)

# In[121]:

vlbw_complete = vlbw_numeric[vlbw_numeric.ivh.notnull()]
y = vlbw_complete.pop('ivh').values

# In[122]:

from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

# In[123]:

imp.fit(vlbw_complete)

# In[124]:

X = imp.transform(vlbw_complete)

# In[125]:

from sklearn.ensemble import RandomForestClassifier

# In[126]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# In[127]:

m = [1, 3, 5, 10, 15, 20]

oob_error = []
test_error = []

for mi in m:
    # A larger number of trees gives a more stable out-of-bag estimate
    rfc = RandomForestClassifier(n_estimators=100, max_features=mi, oob_score=True)
    rfc.fit(X_train, y_train)
    oob_error.append(1 - rfc.oob_score_)
    test_error.append(1 - rfc.score(X_test, y_test))

# In[128]:

plt.plot(m, oob_error, label='OOB error')
plt.plot(m, test_error, label='test error')
plt.xlabel('max_features (m)')
plt.legend();

# ## Bonus: Question 5
#
# Use a grid search to optimize the number of estimators and max_depth for a Gradient Boosted Decision tree using the very low birthweight infant data. Plug this optimal ``max_depth`` into a *single* decision tree. Does this single tree over-fit or under-fit the data? Repeat this for the Random Forest. Construct a single decision tree using the ``max_depth`` which is optimal for the Random Forest. Does this single tree over-fit or under-fit the data?

# In[129]:

from sklearn.ensemble import GradientBoostingClassifier

parameters = {'max_depth': [1, 2, 3, 5, 7], 'n_estimators': [25, 50, 100, 150]}

gbc = GradientBoostingClassifier()
gbc_cv = GridSearchCV(gbc, parameters)

# In[130]:

gbc_cv.fit(X, y)

# In[131]:

best_gbc_params = gbc_cv.best_params_

# In[132]:

gbc_single = GradientBoostingClassifier(max_depth=best_gbc_params['max_depth'],
                                        n_estimators=best_gbc_params['n_estimators'])
gbc_single.fit(X_train, y_train)
gbc_single.score(X_test, y_test)

# In[133]:

gbc_single.score(X_train, y_train)

# In[134]:

rfc = RandomForestClassifier()
rf_cv = GridSearchCV(rfc, parameters)

# In[135]:

rf_cv.fit(X, y)

# In[136]:

best_rf_params = rf_cv.best_params_

# In[137]:

from sklearn import tree

tree_classifier = tree.DecisionTreeClassifier(max_depth=best_rf_params['max_depth'])
tree_classifier.fit(X_train, y_train)
tree_classifier.score(X_test, y_test)

# In[138]:

tree_classifier.score(X_train, y_train)