from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

%matplotlib inline

hitters_df = pd.read_csv("../data/Hitters.csv")
hitters_df.dropna(inplace=True)
hitters_df.head()

# Convert the non-numeric input variables to integer codes.
hitters_df["League"] = pd.factorize(hitters_df["League"])[0]
hitters_df["Division"] = pd.factorize(hitters_df["Division"])[0]
hitters_df["NewLeague"] = pd.factorize(hitters_df["NewLeague"])[0]
hitters_df.head()

# Construct a baseline regressor trained on all features; its
# training-set RMSE is the number to beat.
collist = [col for col in hitters_df.columns if col != "Salary"]
X = hitters_df[collist]
y = hitters_df["Salary"]

reg = LinearRegression()
reg.fit(X, y)
ypred = reg.predict(X)
np.sqrt(mean_squared_error(y, ypred))

# Compute the training RMSE for models built on the top-k features,
# ranked by univariate F-score, for k = 1 .. all features.
train_rmses = []
nfeatures = range(1, len(collist) + 1)
for nfeature in nfeatures:
    selector = SelectKBest(f_regression, k=nfeature)
    selector.fit(X, y)
    selected = selector.get_support()
    feats = [col for (col, sel) in zip(collist, selected) if sel]
    X_r = hitters_df[feats]
    reg = LinearRegression()
    reg.fit(X_r, y)
    ypred = reg.predict(X_r)
    train_rmses.append(np.sqrt(mean_squared_error(y, ypred)))

plt.plot(nfeatures, train_rmses)
plt.xlabel("k")
plt.ylabel("RMSE")

# Repeat the experiment with 10-fold cross validation: each model is
# scored on held-out folds instead of the training data.
cv_errors = []
kfold = KFold(n_splits=10)
y = hitters_df["Salary"].values
nfeatures = range(1, len(collist) + 1)
for nfeature in nfeatures:
    selector = SelectKBest(f_regression, k=nfeature)
    selector.fit(X, y)
    selected = selector.get_support()
    feats = [col for (col, sel) in zip(collist, selected) if sel]
    X_r = hitters_df[feats].values
    rmses = []
    for train, test in kfold.split(X_r):
        Xtrain, ytrain, Xtest, ytest = X_r[train], y[train], X_r[test], y[test]
        reg = LinearRegression()
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ytest, ypred)))
    cv_errors.append(np.mean(rmses))

plt.plot(nfeatures, cv_errors)
plt.xlabel("k")
plt.ylabel("RMSE")

def cross_validate(X, y, nfolds, reg_name):
    """Return the mean held-out RMSE of the named regressor over nfolds folds."""
    rmses = []
    kfold = KFold(n_splits=nfolds)
    for train, test in kfold.split(X):
        Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
        if reg_name == "ridge":
            reg = Ridge()
        elif reg_name == "lasso":
            reg = Lasso()
        else:
            reg = LinearRegression()
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ytest, ypred)))
    return np.mean(rmses)

# Compare the unregularized baseline against ridge and lasso, each
# with its default regularization strength.
collist = [col for col in hitters_df.columns if col != "Salary"]
X = hitters_df[collist].values
y = hitters_df["Salary"].values

rmse_baseline = cross_validate(X, y, 10, "baseline")
rmse_ridge = cross_validate(X, y, 10, "ridge")
rmse_lasso = cross_validate(X, y, 10, "lasso")
(rmse_baseline, rmse_ridge, rmse_lasso)

# Cross validate the lasso over a grid of regularization strengths.
cv_errors = []
alphas = [0.1 * alpha for alpha in range(1, 200, 20)]
kfold = KFold(n_splits=10)
for alpha in alphas:
    rmses = []
    for train, test in kfold.split(X):
        Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]
        reg = Lasso(alpha=alpha)
        reg.fit(Xtrain, ytrain)
        ypred = reg.predict(Xtest)
        rmses.append(np.sqrt(mean_squared_error(ytest, ypred)))
    cv_errors.append(np.mean(rmses))

plt.plot(alphas, cv_errors)
plt.xlabel("alpha")
plt.ylabel("RMSE")
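
# Aside (not part of the original lab): the top-k loop above fits
# SelectKBest on the full dataset before cross validating, so the
# held-out folds leak into the feature ranking. A minimal sketch of a
# leak-free version, assuming hitters_df has been prepared as above:
# wrapping the selector and regressor in a Pipeline makes the
# selection refit inside each training fold.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("select", SelectKBest(f_regression, k=5)),  # k=5 is an arbitrary example value
    ("reg", LinearRegression()),
])
# cross_val_score returns negated MSE per fold for this scorer.
scores = cross_val_score(pipe, X, y, cv=10, scoring="neg_mean_squared_error")
np.sqrt(-scores.mean())  # square root of the mean fold MSE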
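
# Aside (not part of the original lab): scikit-learn can also run the
# alpha search itself. A minimal sketch using LassoCV over the same
# grid; alpha_ holds the value with the lowest cross-validated error.
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=alphas, cv=10)
lasso_cv.fit(X, y)
lasso_cv.alpha_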