import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

%matplotlib inline

# Data from R ISLR package - write.csv(Boston, "Boston.csv", col.names = FALSE)
boston_df = pd.read_csv("../data/Boston.csv")
boston_df.head()

# LSTAT - % of population with lower socioeconomic status; MEDV - median home value
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")

# The statsmodels library provides a smaller set of models, but puts more emphasis on
# parameter estimation and statistical testing. Its summary output is similar to R's
# summary() function.
# X is a 2-D array of predictor columns, y is a single column of responses.
X = boston_df[["lstat"]].values
X = sm.add_constant(X)  # add the intercept term
y = boston_df["medv"].values
ols = sm.OLS(y, X).fit()
ols.summary()

# Scikit-learn provides a larger number of models, but takes more of a machine learning
# point of view and doesn't report the statistical tests shown above. However, it produces
# an identical linear model, as shown below:
reg = LinearRegression()
X = boston_df[["lstat"]].values
y = boston_df["medv"].values
reg.fit(X, y)
(reg.intercept_, reg.coef_)

# Drawing the regression line on top of the scatterplot
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
lstats = boston_df["lstat"].values
xs = range(int(np.min(X[:, 0])), int(np.max(X[:, 0])))
ys = [reg.predict([[x]])[0] for x in xs]  # predict() expects a 2-D array
ax.plot(xs, ys, 'r', linewidth=2.5)

# Prediction
test_data = [[5], [10], [15]]
reg.predict(test_data)

# Regression with 2 input columns
X = boston_df[["lstat", "age"]]
reg2 = LinearRegression()
reg2.fit(X, y)
(reg2.intercept_, reg2.coef_)

# Regression using all input columns
xcols = boston_df.columns[0:-1]
X = boston_df[xcols]
reg3 = LinearRegression()
reg3.fit(X, y)
(reg3.intercept_, reg3.coef_)

# Plotting a fitted regression in R returns 4 graphs - Residuals vs Fitted, Normal Q-Q,
# Scale-Location (Standardized Residuals vs Fitted), and Residuals vs Leverage. The Q-Q
# plot is available directly from statsmodels. The residuals_vs_fitted function below is
# used for plots #1 and #3; a Residuals vs Leverage sketch follows the reg3 diagnostics.

def residuals_vs_fitted(fitted, residuals, xlabel, ylabel):
    plt.subplot(111)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.scatter(fitted, residuals)
    # model non-linearity in the residuals with a quadratic fit
    polyline = np.poly1d(np.polyfit(fitted, residuals, 2))
    xs = range(int(np.min(fitted)), int(np.max(fitted)))
    plt.plot(xs, polyline(xs), color='r', linewidth=2.5)

def qq_plot(residuals):
    sm.qqplot(residuals)

def standardize(xs):
    xmean = np.mean(xs)
    xstd = np.std(xs)
    return (xs - xmean) / xstd

fitted = reg3.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
fig = sm.qqplot(residuals, line="r")
residuals_vs_fitted(fitted, std_residuals, "Fitted", "Std.Residuals")
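# Residuals vs Leverage (plot #4): a minimal sketch, not part of the original lab code.
# It assumes the full model is refit with sm.OLS so that statsmodels' influence
# diagnostics (get_influence, hat_matrix_diag, resid_studentized_internal) are available;
# scikit-learn's LinearRegression does not expose them.
ols_full = sm.OLS(y, sm.add_constant(boston_df[xcols].values)).fit()
influence = ols_full.get_influence()
plt.scatter(influence.hat_matrix_diag, influence.resid_studentized_internal)
plt.xlabel("Leverage")
plt.ylabel("Std. Residuals")
# statsmodels can also draw this directly, with point size scaled by Cook's distance:
# sm.graphics.influence_plot(ols_full)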
# Fitting medv ~ lstat * age
boston_df["lstat*age"] = boston_df["lstat"] * boston_df["age"]
reg5 = LinearRegression()
X = boston_df[["lstat", "age", "lstat*age"]]
y = boston_df["medv"]
reg5.fit(X, y)
(reg5.intercept_, reg5.coef_)

fitted = reg5.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")

# Fitting medv ~ lstat + I(lstat^2)
boston_df["lstat^2"] = boston_df["lstat"] ** 2
reg6 = LinearRegression()
X = boston_df[["lstat", "lstat^2"]]
y = boston_df["medv"]
reg6.fit(X, y)

# save the predicted ys for the given xs for a later plot
lstats = boston_df["lstat"].values
xs = range(int(np.min(lstats)), int(np.max(lstats)))
ys6 = [reg6.predict([[x, x * x]])[0] for x in xs]  # predict() expects a 2-D array
(reg6.intercept_, reg6.coef_)

fitted = reg6.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")

# Fitting medv ~ poly(lstat, 4). We already have lstat and lstat^2 from above.
boston_df["lstat^3"] = np.power(boston_df["lstat"], 3)
boston_df["lstat^4"] = np.power(boston_df["lstat"], 4)
X = boston_df[["lstat^4", "lstat^3", "lstat^2", "lstat"]]
y = boston_df["medv"]
reg7 = LinearRegression()
reg7.fit(X, y)
ys7 = [reg7.predict([[x**4, x**3, x**2, x]])[0] for x in xs]
(reg7.intercept_, reg7.coef_)

fitted = reg7.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")

# Plot the different fits. Note that the green curve (reg7) follows the distribution
# better than the red curve (reg6).
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
plt.plot(xs, ys6, color='r', linewidth=2.5)
plt.plot(xs, ys7, color='g', linewidth=2.5)

# Data from ISLR package: write.csv(Carseats, 'Carseats.csv', col.names=FALSE)
carseats_df = pd.read_csv("../data/Carseats.csv")
carseats_df.head()

# Convert non-numeric columns to factors
carseats_df["ShelveLoc"] = pd.factorize(carseats_df["ShelveLoc"])[0]
carseats_df["Urban"] = pd.factorize(carseats_df["Urban"])[0]
carseats_df["US"] = pd.factorize(carseats_df["US"])[0]

# Sales ~ . + Income:Advertising + Age:Price
carseats_df["Income:Advertising"] = carseats_df["Income"] * carseats_df["Advertising"]
carseats_df["Age:Price"] = carseats_df["Age"] * carseats_df["Price"]
X = carseats_df[carseats_df.columns.drop("Sales")]  # all columns except the response
y = carseats_df["Sales"]
reg = LinearRegression()
reg.fit(X, y)
(reg.intercept_, reg.coef_)
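# A hedged aside, not part of the lab code: the same model can be fit with statsmodels'
# formula API to get an R-style summary with standard errors and p-values. The formula
# below assumes the standard ISLR Carseats column names; C() treats the integer-coded
# factor columns as categoricals, and Income:Advertising / Age:Price are interaction terms.
import statsmodels.formula.api as smf
formula = ("Sales ~ CompPrice + Income + Advertising + Population + Price"
           " + C(ShelveLoc) + Age + Education + C(Urban) + C(US)"
           " + Income:Advertising + Age:Price")
smf.ols(formula, data=carseats_df).fit().summary()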
# R has a contrasts() function that shows how factors are encoded by default. We can do
# this manually using scikit-learn's OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder

colnames = ["ShelveLoc", "Urban", "US"]
enc = OneHotEncoder()
X = carseats_df[colnames]
enc.fit(X)
X_tr = enc.transform(X).toarray()
# number of levels per encoded column (enc.n_values_ in older scikit-learn versions)
colnos = [len(cats) for cats in enc.categories_]
colnames_tr = []
for (idx, colname) in enumerate(colnames):
    for i in range(0, colnos[idx]):
        colnames_tr.append(colname + "_" + str(i))
col = 0
for colname_tr in colnames_tr:
    carseats_df[colname_tr] = X_tr[:, col]
    col = col + 1
del carseats_df["ShelveLoc"]
del carseats_df["Urban"]
del carseats_df["US"]
carseats_df[colnames_tr].head()

def regplot(x, y, xlabel, ylabel, dot_style, line_color):
    x = x.values
    y = y.values
    reg = LinearRegression()
    X = x.reshape(-1, 1)  # single predictor as a 2-D column
    reg.fit(X, y)
    ax = plt.scatter(x, y, marker=dot_style)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    xs = range(int(np.min(x)), int(np.max(x)))
    ys = [reg.predict([[xval]])[0] for xval in xs]
    plt.plot(xs, ys, color=line_color, linewidth=2.5)

regplot(carseats_df["Price"], carseats_df["Sales"], "Price", "Sales", 'o', 'r')
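# For comparison, an aside assuming the seaborn package is installed: seaborn's regplot
# draws the same scatterplot plus fitted line in a single call, with a confidence band.
import seaborn as sns
sns.regplot(x="Price", y="Sales", data=carseats_df)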