from __future__ import division import pandas as pd import numpy as np import scipy as sp import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.metrics import mean_squared_error from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import f_regression from sklearn.feature_selection import chi2 from sklearn.cluster import MeanShift from scipy.interpolate import LSQUnivariateSpline %matplotlib inline # Data comes from ISLR::Wage - written using write.csv(Wage, "wage.csv", row.names=FALSE) wage_df = pd.read_csv("../data/Wage.csv") wage_df.head() # factorize non-numeric variables wage_df["sex"] = pd.factorize(wage_df["sex"])[0] wage_df["maritl"] = pd.factorize(wage_df["maritl"])[0] wage_df["race"] = pd.factorize(wage_df["race"])[0] wage_df["education"] = pd.factorize(wage_df["education"])[0] wage_df["region"] = pd.factorize(wage_df["region"])[0] wage_df["health"] = pd.factorize(wage_df["health"])[0] wage_df["health_ins"] = pd.factorize(wage_df["health_ins"])[0] wage_df.head() def poly(x, degree = 1): n = degree + 1 x = np.asarray(x).flatten() if(degree >= len(np.unique(x))): print("'degree' must be less than number of unique points") return xbar = np.mean(x) x = x - xbar X = np.fliplr(np.vander(x, n)) q, r = np.linalg.qr(X) z = np.diag(np.diag(r)) raw = np.dot(q, z) norm2 = np.sum(raw ** 2, axis=0) alpha = (np.sum((raw ** 2) * np.reshape(x,(-1,1)), axis=0) / norm2 + xbar)[:degree] Z = raw / np.sqrt(norm2) return Z, norm2, alpha X = poly(wage_df["age"].values, 4)[0] X[0:5, 1:] y = wage_df["wage"].values reg = LinearRegression() reg.fit(X, y) print "Intercepts:", reg.intercept_ print "Coefficients:", reg.coef_ ax = wage_df.plot(x="age", y="wage", style="o") ax.set_ylabel("wage") # The poly() method cannot return features for a single X value, so we have # to plot the raw predictions. 
# Smooth curve through the orthogonal-basis fit: poly() cannot produce
# features for a single new x, so refit a quartic to the raw predictions
# purely for plotting.
age = wage_df["age"].values
ypred = reg.predict(X)
polyline = np.poly1d(np.polyfit(age, ypred, 4))
xs = range(int(np.min(age)), int(np.max(age)))
ys = polyline(xs)
ax.plot(xs, ys, 'r', linewidth=2.5)

# Same regression with raw (non-orthogonal) power features: age..age^4.
x1 = wage_df["age"].values
x2 = np.power(x1, 2)
x3 = np.power(x1, 3)
x4 = np.power(x1, 4)
X = np.vstack((x1, x2, x3, x4)).T
X[0:5, :]

y = wage_df["wage"].values
reg = LinearRegression()
reg.fit(X, y)
print("Intercepts:", reg.intercept_)
print("Coefficients:", reg.coef_)

ax = wage_df.plot(x="age", y="wage", style="o")
ax.set_ylabel("wage")
xs = range(int(np.min(x1)), int(np.max(x1)))
# predict() expects a 2-D (n_samples, n_features) array; the original passed
# a bare 1-D feature vector, which scikit-learn rejects.
ys = [reg.predict([[x, x ** 2, x ** 3, x ** 4]])[0] for x in xs]
ax.plot(xs, ys, 'r', linewidth=2.5)

# Only the F-scores / p-values are inspected, so keep every feature.
# (k=10 with just 4 features raises ValueError in scikit-learn.)
selector = SelectKBest(score_func=f_regression, k="all")
selector.fit(X, y)
(selector.scores_, selector.pvalues_)

# Logistic regression: P(high earner, wage > 250) on a cubic in age.
X = np.vstack((x1, x2, x3)).T
y = wage_df["wage"].map(lambda w: 1 if w > 250 else 0).values
reg = LogisticRegression()
reg.fit(X, y)
print("Intercepts:", reg.intercept_)
print("Coefficients:", reg.coef_)
ypred = reg.predict(X)
print("MSE:", mean_squared_error(y, ypred))

# Plot the predicted probability of being a high-earner over age range
ages = wage_df["age"].values
xs = range(int(np.min(ages)), int(np.max(ages)))
# predict_proba needs 2-D input, and column 1 is P(class == 1), i.e.
# P(wage > 250); the original read column 0 (P(wage <= 250)), which
# contradicted the axis label below.
ys = [reg.predict_proba([[x, x * x, x * x * x]])[0][1] for x in xs]
plt.plot(xs, ys)
plt.xlabel("age")
plt.ylabel("p(wage > 250k)")

# Spline fit of wage on age with data-driven knots.
ages = wage_df["age"].values
wages = wage_df["wage"].values
X = np.vstack((ages, wages)).T

# cluster points to find the knot locations
msc = MeanShift()
msc.fit(X)
# LSQUnivariateSpline requires increasing x and sorted interior knots lying
# strictly inside (min(x), max(x)): sort the data by age, sort the cluster
# centers, and drop any knot at or beyond the data range.
knots = np.sort(msc.cluster_centers_[:, 0])
knots = knots[(knots > ages.min()) & (knots < ages.max())]
order = np.argsort(ages)
spl = LSQUnivariateSpline(ages[order], wages[order], knots)
xs = range(int(np.min(ages)), int(np.max(ages)))
ys = spl(xs)

# plot the points and the spline
ax = wage_df.plot(x="age", y="wage", style="o")
ax.set_ylabel("wage")
ax.plot(xs, ys, 'r', linewidth=2.5)