# %matplotlib inline   <- IPython magic; commented out so the file is valid plain Python.
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

# colorbrewer2 Dark2 qualitative color table
dark2_colors = [
    (0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was removed in matplotlib 2.0; 'axes.prop_cycle' replaces it.
rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn
    """
    # `axes or plt.gca()` would misbehave for any falsy axes object; test identity.
    ax = axes if axes is not None else plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Two-letter state/territory abbreviation -> full name.
states_abbrev_dict = {
    'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa',
    'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
    'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida',
    'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky',
    'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri',
    'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana',
    'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota',
    'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
    'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia',
    'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington',
    'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming'
}
# Inverse map: full name -> abbreviation.
abbrev_states_dict = {v: k for k, v in states_abbrev_dict.items()}

census_data = pd.read_csv("./data/census_demographics.csv")
census_data.head()


def capitalize(s):
    """Title-case a state name, keeping 'of' lowercase ('District of Columbia')."""
    s = s.title()
    s = s.replace("Of", "of")
    return s


census_data["State"] = census_data.state.map(capitalize)
del census_data["state"]
# Replace full state names by their two-letter abbreviations and index by them.
census_data['State'] = census_data['State'].replace(abbrev_states_dict)
census_data.set_index("State", inplace=True)
census_data.head()

smaller_frame = census_data[['educ_coll', 'average_income', 'per_vote']]

# pandas.tools.plotting was removed; scatter_matrix lives in pandas.plotting now.
from pandas.plotting import scatter_matrix
axeslist = scatter_matrix(smaller_frame, alpha=0.8, figsize=(12, 12), diagonal="kde")
for ax in axeslist.flatten():
    ax.grid(False)
smaller_frame.corr()

from sklearn.linear_model import LinearRegression

# Standardize (z-score) the two predictors column-wise.
X_HD = smaller_frame[['educ_coll', 'average_income']].values
X_HDn = (X_HD - X_HD.mean(axis=0)) / X_HD.std(axis=0)
educ_coll_std_vec = X_HDn[:, 0]
educ_coll_std = educ_coll_std_vec.reshape(-1, 1)
average_income_std_vec = X_HDn[:, 1]
average_income_std = average_income_std_vec.reshape(-1, 1)

# sklearn.cross_validation was removed; train_test_split is in model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(educ_coll_std, average_income_std_vec)

clf1 = LinearRegression()
clf1.fit(X_train, y_train)
predicted_train = clf1.predict(X_train)
predicted_test = clf1.predict(X_test)
trains = X_train.reshape(1, -1).flatten()
tests = X_test.reshape(1, -1).flatten()
print(clf1.coef_, clf1.intercept_)

# Fitted line over the scatter of standardized data.
plt.scatter(educ_coll_std_vec, average_income_std_vec, c='r')
plt.plot(trains, predicted_train, c='g', alpha=0.5)
plt.plot(tests, predicted_test, c='g', alpha=0.2)

# Residual plot: prediction vs (prediction - truth), with a zero line.
plt.scatter(predicted_test, predicted_test - y_test, c='g', s=40)
plt.scatter(predicted_train, predicted_train - y_train, c='b', s=40, alpha=0.5)
plt.plot([0.4, 2], [0, 0])
clf1.score(X_train, y_train), clf1.score(X_test, y_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X = pca.fit_transform(X_HDn)
print(pca.explained_variance_ratio_)
plt.scatter(X[:, 0], X[:, 1])

pca1 = PCA(n_components=1)  # only keep one dimension!
X_E = pca1.fit_transform(X_HDn)
X_reconstructed = pca1.inverse_transform(X_E)
plt.scatter(X_reconstructed[:, 0], X_reconstructed[:, 1], c='b', s=35, alpha=0.7)
plt.scatter(educ_coll_std_vec, average_income_std_vec, s=40, c='r', alpha=0.6)
plt.plot(trains, predicted_train, c='g', alpha=0.3)
plt.plot(tests, predicted_test, c='g', alpha=0.3)

# your code here: regress the other way around (income -> education).
clf2 = LinearRegression()
clf2.fit(average_income_std, educ_coll_std_vec)
print(clf2.coef_, clf2.intercept_)
plt.plot(average_income_std_vec, clf2.predict(average_income_std))
plt.scatter(average_income_std_vec, educ_coll_std_vec)

from IPython.display import Image as Im
from IPython.display import display
Im('./data/shuttle.png')

# Challenger O-ring data: one row per launch, columns are temperature (F)
# and failure indicator. Use a context manager so the file handle is closed.
with open("./data/chall.txt") as fh:
    data = np.array([[float(j) for j in e.strip().split()] for e in fh])
data
temps, pfail = data[:, 0], data[:, 1]
plt.scatter(temps, pfail)
axes = plt.gca()
axes.grid(False)
remove_border(axes)

from sklearn.linear_model import LogisticRegression
reg = 1000.
# Logistic regression on the O-ring data with weak regularization (C=reg=1000).
clf4 = LogisticRegression(C=reg)
clf4.fit(temps.reshape(-1, 1), pfail)
tempsnew = np.linspace(20., 90., 15)
probs = clf4.predict_proba(tempsnew.reshape(-1, 1))[:, 1]
predicts = clf4.predict(tempsnew.reshape(-1, 1))
plt.scatter(temps, pfail)
axes = plt.gca()
axes.grid(False)
remove_border(axes)
plt.plot(tempsnew, probs, marker='s')
plt.scatter(tempsnew, predicts, marker='s', color="green")
pd.crosstab(pfail, clf4.predict(temps.reshape(-1, 1)),
            rownames=["Actual"], colnames=["Predicted"])

# your code here
# Same fit with the default (stronger) regularization C=1.
clf4w = LogisticRegression()
clf4w.fit(temps.reshape(-1, 1), pfail)
probsw = clf4w.predict_proba(tempsnew.reshape(-1, 1))[:, 1]
predictsw = clf4w.predict(tempsnew.reshape(-1, 1))
plt.scatter(temps, pfail)
axes = plt.gca()
axes.grid(False)
remove_border(axes)
plt.plot(tempsnew, probsw, marker='s')
plt.scatter(tempsnew, predictsw, marker='s', color="green")

# your code here
pd.crosstab(pfail, clf4w.predict(temps.reshape(-1, 1)),
            rownames=["Actual"], colnames=["Predicted"])

from sklearn.linear_model import LogisticRegression


def fit_logistic(X_train, y_train, reg=0.0001, penalty="l2"):
    """Fit a logistic regression with inverse regularization strength C=reg.

    solver="liblinear" is required: it is the only core solver supporting
    both the "l1" and "l2" penalties used here (and was sklearn's historical
    default, so behavior is unchanged).
    """
    clf = LogisticRegression(C=reg, penalty=penalty, solver="liblinear")
    clf.fit(X_train, y_train)
    return clf


# sklearn.grid_search was removed; GridSearchCV lives in model_selection now.
from sklearn.model_selection import GridSearchCV


def cv_optimize(X_train, y_train, paramslist, penalty="l2", n_folds=10):
    """Grid-search C over paramslist with n_folds-fold CV.

    Returns (best_params, best_score).
    """
    clf = LogisticRegression(penalty=penalty, solver="liblinear")
    parameters = {"C": paramslist}
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X_train, y_train)
    return gs.best_params_, gs.best_score_


def cv_and_fit(X_train, y_train, paramslist, penalty="l2", n_folds=5):
    """Cross-validate to pick the best C, then refit on the full training set."""
    bp, bs = cv_optimize(X_train, y_train, paramslist,
                         penalty=penalty, n_folds=n_folds)
    print("BP,BS", bp, bs)
    clf = fit_logistic(X_train, y_train, penalty=penalty, reg=bp['C'])
    return clf


clf = cv_and_fit(temps.reshape(-1, 1), pfail, np.logspace(-4, 3, num=100))
pd.crosstab(pfail, clf.predict(temps.reshape(-1, 1)),
            rownames=["Actual"], colnames=["Predicted"])
plt.scatter(temps, pfail, s=40)
axes = plt.gca()
axes.grid(False)
remove_border(axes)
probs = clf.predict_proba(tempsnew.reshape(-1, 1))[:, 1]
predicts = clf.predict(tempsnew.reshape(-1, 1))
plt.plot(tempsnew, probs, marker='s')
plt.scatter(tempsnew, predicts, marker='D', color="green", s=80, alpha=0.4)
train_probs = clf.predict_proba(temps.reshape(-1, 1))[:, 1]
plt.scatter(temps, train_probs, marker='s', c='r', alpha=0.5, s=40)
train_predicts = clf.predict(temps.reshape(-1, 1))
plt.scatter(temps, train_predicts, marker='s', c='r', alpha=0.2, s=80)
# zip() is lazy in Python 3: materialize so the (temp, actual, predicted)
# triples are actually produced for display.
list(zip(temps, pfail, clf.predict(temps.reshape(-1, 1))))
Im('./data/chall-table.png')

from PIL import Image

# setup a standard image size; this will distort some images but will get
# everything into the same shape
STANDARD_SIZE = (322, 137)


def img_to_matrix(filename, verbose=False):
    """
    takes a filename and turns it into a numpy array of RGB pixels
    """
    img = Image.open(filename)
    if verbose:
        print("changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE)))
    img = img.resize(STANDARD_SIZE)
    # map() is lazy in Python 3; materialize before handing to numpy,
    # otherwise np.array() would wrap the iterator in a 0-d object array.
    pixels = list(map(list, img.getdata()))
    return np.array(pixels)


def flatten_image(img):
    """
    takes in an (m, n) numpy array and flattens it into an array of shape (1, m * n)
    """
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]


import os
checks_dir = "./data/images/images/checks/"
dollars_dir = "./data/images/images/dollars/"


def images(img_dir):
    """Return full paths for every file in img_dir."""
    return [img_dir + f for f in os.listdir(img_dir)]


checks = images(checks_dir)
dollars = images(dollars_dir)
# Named image_files (the original `images` shadowed the helper function above).
image_files = checks + dollars
labels = ["check" for i in range(len(checks))] + ["dollar" for i in range(len(dollars))]
len(labels), len(image_files)
for i in range(3):
    display(Im(checks[i]))
for i in range(3):
    display(Im(dollars[i]))

i0 = image_files[20]
display(Im(i0))
i0m = img_to_matrix(i0)
print(i0m.shape)
plt.imshow(i0m[:, 1].reshape(137, 322))

# Build the (n_images, 322*137*3) matrix of flattened RGB pixel rows.
data = []
for image in image_files:
    img = img_to_matrix(image)
    img = flatten_image(img)
    data.append(img)
data = np.array(data)
data.shape

# Binary target: 1 for checks, 0 for dollars.
y = np.where(np.array(labels) == "check", 1, 0)
y.shape


def do_pca(d, n):
    """Fit an n-component PCA on d; return (transformed data, fitted PCA)."""
    pca = PCA(n_components=n)
    X = pca.fit_transform(d)
    print(pca.explained_variance_ratio_)
    return X, pca


X20, pca20 = do_pca(data, 20)
np.sum(pca20.explained_variance_ratio_)

X2, pca2 = do_pca(data, 2)
df = pd.DataFrame({"x": X2[:, 0], "y": X2[:, 1],
                   "label": np.where(y == 1, "check", "dollar")})
colors = ["red", "yellow"]
for label, color in zip(df['label'].unique(), colors):
    mask = df['label'] == label
    plt.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
plt.legend()


def normit(a):
    """Rescale a to 0..256 and round — 8-bit channel values for display."""
    a = (a - a.min()) / (a.max() - a.min())
    a = a * 256
    return np.round(a)


def getRGB(o):
    """Split a flat interleaved RGB pixel vector into normalized r, g, b arrays."""
    size = 322 * 137 * 3
    r = o[0:size:3]
    g = o[1:size:3]
    b = o[2:size:3]
    return normit(r), normit(g), normit(b)


def getNC(pc, j):
    """RGB channels of the j-th principal component of a fitted PCA."""
    return getRGB(pc.components_[j])


def getMean(pc):
    """RGB channels of the fitted PCA's mean image."""
    return getRGB(pc.mean_)


def display_from_RGB(r, g, b):
    """Reassemble r, g, b channel vectors into a 137x322 image and show it."""
    rgbArray = np.zeros((137, 322, 3), 'uint8')
    rgbArray[..., 0] = r.reshape(137, 322)
    rgbArray[..., 1] = g.reshape(137, 322)
    rgbArray[..., 2] = b.reshape(137, 322)
    img = Image.fromarray(rgbArray)
    plt.imshow(np.asarray(img))
    ax = plt.gca()
    ax.set_xticks([])
    ax.set_yticks([])
    return ax


def display_component(pc, j):
    """Display the j-th principal component of pc as an RGB image."""
    r, g, b = getNC(pc, j)
    return display_from_RGB(r, g, b)


display_component(pca2, 0)
display_component(pca2, 1)

# your code here
X5, pca5 = do_pca(data, 5)
# your code here
np.sum(pca5.explained_variance_ratio_)
# your code here
for i in range(5):
    plt.figure()
    display_component(pca5, i)
display_from_RGB(*getMean(pca5))

from matplotlib.colors import ListedColormap


def points_plot(Xtr, Xte, ytr, yte, clf):
    """Scatter train/test points in 2-D with the classifier's P(class=1) surface."""
    X = np.concatenate((Xtr, Xte))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                         np.linspace(y_min, y_max, 50))
    # Renamed from `cm` to avoid shadowing the module-level matplotlib.cm import.
    cmap = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    f, ax = plt.subplots()
    # Plot the training points
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr, cmap=cm_bright)
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=yte, cmap=cm_bright,
               marker="s", s=50, alpha=0.9)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    # Probability of class 1 over the grid, drawn as filled + labeled contours.
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cmap, alpha=.4)
    cs2 = ax.contour(xx, yy, Z, cmap=cmap, alpha=.4)
    plt.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
    return ax


# Random ~70/30 train/test split, then project onto 2 principal components.
is_train = np.random.uniform(0, 1, len(data)) <= 0.7
train_x, train_y = data[is_train], y[is_train]
test_x, test_y = data[~is_train], y[~is_train]
pca = PCA(n_components=2)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)
logreg = cv_and_fit(train_x, train_y, np.logspace(-4, 3, num=100))
pd.crosstab(test_y, logreg.predict(test_x),
            rownames=["Actual"], colnames=["Predicted"])
logreg.coef_, logreg.intercept_
points_plot(train_x, test_x, train_y, test_y, logreg)

logreg_l1 = cv_and_fit(train_x, train_y, np.logspace(-4, 3, num=100), penalty="l1")
pd.crosstab(test_y, logreg_l1.predict(test_x),
            rownames=["Actual"], colnames=["Predicted"])
print(logreg_l1.coef_, logreg_l1.intercept_)
points_plot(train_x, test_x, train_y, test_y, logreg_l1)

# your code here
# Repeat the experiment with 5 principal components.
is_train = np.random.uniform(0, 1, len(data)) <= 0.7
train_x, train_y = data[is_train], y[is_train]
test_x, test_y = data[~is_train], y[~is_train]
pca = PCA(n_components=5)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)
# your code here
logreg5 = cv_and_fit(train_x, train_y, np.logspace(-4, 3, num=100))
pd.crosstab(test_y, logreg5.predict(test_x),
            rownames=["Actual"], colnames=["Predicted"])
# your code here
print(logreg5.coef_, logreg5.intercept_)
# your code here
logreg5_l1 = cv_and_fit(train_x, train_y, np.logspace(-5, 3, num=100), penalty="l1")
pd.crosstab(test_y, logreg5_l1.predict(test_x),
            rownames=["Actual"], colnames=["Predicted"])
# your code here
print(logreg5_l1.coef_, logreg5_l1.intercept_)