!wget -nc --directory-prefix data \ https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data !head data/transfusion.data import numpy as np import pandas as pd df = pd.read_csv('data/transfusion.data') df.head() df.shape df.dtypes # save the current names in case we need them later original_column_names = df.columns # make the names less ugly names = ['recency', 'frequency', 'cc', 'time', 'donated'] df.columns = names df.head() # import our graphics tools %matplotlib inline import matplotlib as mpl from matplotlib import pyplot as plt import seaborn as sns # nice defaults for matplotlib styles set2 = sns.color_palette('Set2') # add on some settings from 'Bayesian Methods for Hackers' plt.style.use('bmh') # set larger default fonts for presentation-friendliness mpl.rc('figure', figsize=(10, 8)) mpl.rc('axes', labelsize=16, titlesize=20) from pandas.tools.plotting import scatter_matrix axeslist = scatter_matrix(df, alpha=0.8, figsize=(10, 10)) for ax in axeslist.flatten(): ax.grid(False) import numpy as np # create a figure with 4 subplots fig, axs = plt.subplots(nrows=2, ncols=2) feature_column_names = df.columns[:-1] label_column_name = df.columns[-1] for i, col in enumerate(feature_column_names): # get the current subplot to work on ax = axs.ravel()[i] # create some random y jitter to add jitter = np.random.uniform(low=-0.05, high=0.05, size=len(df)) # plot the data ax.scatter(x=df[col], y=df[label_column_name] + jitter, c=df.donated, cmap='coolwarm', alpha=0.5) # label the axes ax.set_xlabel(col) ax.set_ylabel(label_column_name) plt.tight_layout() plt.show() from sklearn.cross_validation import train_test_split # using conventional sklearn variable names X = df[feature_column_names].astype(float) y = df.donated.ravel() # break up the data into train and test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) X_train.shape y_train.shape X_test.shape y_test.shape from sklearn.tree import DecisionTreeClassifier clf_tree = DecisionTreeClassifier(max_depth=3) clf_tree.fit(X_train, y_train) print 'Score:', clf_tree.score(X_test, y_test) # viz adapted from http://scikit-learn.org/stable/modules/tree.html import pydot from sklearn.externals.six import StringIO from sklearn.tree import export_graphviz dot_data = StringIO() export_graphviz(clf_tree, out_file=dot_data) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_png('output/decision_tree.png') from sklearn.linear_model import LogisticRegression clf = LogisticRegression(penalty='l2', fit_intercept=True) clf.fit(X_train, y_train) print 'Score:', clf.score(X_test, y_test) from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import LinearSVC other_clfs = { 'Random forest': RandomForestClassifier(), 'AdaBoost': AdaBoostClassifier(), 'Naive Bayes': MultinomialNB(), 'Linear SVC': LinearSVC(), 'KNN': KNeighborsClassifier(5), } # iterating through all of these models we want to fit ... for name, other_clf in other_clfs.iteritems(): # fit the model with the training data print other_clf.fit(X_train, y_train) # cross validation score print '---\nScore:', other_clf.score(X_test, y_test) print '\n' clf clf.coef_ clf.intercept_ clf.get_params() clf.predict(X_test) pd.DataFrame(clf.predict_proba(X_test))\ .head(10) clf.score(X_test, y_test) from sklearn import cross_validation # come up with random folds of the data kf = cross_validation.KFold(len(X), n_folds=5, shuffle=True) def plot_scores(scores): N = len(scores) plt.bar(np.arange(1, N + 1) - 0.4, scores, color=set2[2]) plt.title('{}-fold cross-validation scores'.format(N), fontsize=18) plt.xlabel('fold', fontsize=14) plt.ylabel('score', fontsize=14) plt.xlim(0.5, N + 0.5) plt.ylim(0, 1) plt.show() # evaluate the fitted model on each fold in turn, returns a score for each fold scores = cross_validation.cross_val_score(clf, X, y, cv=kf, n_jobs=1) print 'scores:', scores print 'average score:', np.mean(scores) plot_scores(scores) from sklearn.metrics import log_loss log_loss(y_test, clf.predict_proba(X_test)) from sklearn.metrics import f1_score f1_score(y_test, clf.predict(X_test)) from itertools import permutations from sklearn.metrics import confusion_matrix # get the raw confusion matrix cm = confusion_matrix(y_test, clf.predict(X_test)) # create a dataframe cmdf = pd.DataFrame(cm) cmdf.columns = map(lambda x: 'pred {}'.format(x), cmdf.columns) cmdf.index = map(lambda x: 'actual {}'.format(x), cmdf.index) cmdf from IPython.html.widgets import interact from sklearn.metrics import roc_curve, auc def plot_roc_curve(y_test, probas): # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1]) roc_auc = auc(fpr, tpr) # Plot ROC curve plt.clf() plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) plt.xlabel('False Positive Rate', fontsize=14) plt.ylabel('True Positive Rate', fontsize=14) plt.legend(loc="lower right", fontsize=20) plt.show() def fit_model(penalty, C): clf = LogisticRegression(penalty=penalty, C=C) clf.fit(X_train, y_train) plot_roc_curve(y_test, clf._predict_proba_lr(X_test)) interact(fit_model, penalty=('l2','l1'), C=(0.01, 100, 1)) from sklearn.grid_search import GridSearchCV params_to_try = { 'C': [0.01, 0.1, 1, 10, 100, 100], 'penalty': ['l1', 'l2'] } gs = GridSearchCV(clf, param_grid=params_to_try, cv=5) gs.fit(X, y) print "Best parameters:", gs.best_params_ print "Best score:", gs.best_score_ from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X_train_standardized = scaler.fit_transform(X_train.astype(np.float)) X_train_standardized print 'column means:', np.round(X_train_standardized.mean(axis=0)) print 'column variances:', np.round(X_train_standardized.var(axis=0)) X_new = np.array([25., 35., 9200., 90.]) scaler.transform(X_new) from sklearn.decomposition import PCA # instantiate the PCA transformation object pca = PCA(n_components=2, whiten=True) # fit the PCA object on and transform the training data X_train_pca = pca.fit_transform(X_train_standardized) # create a 3d figure fig = plt.figure() ax = fig.add_subplot(111) # scatterplot the PCA points ax.scatter(*np.hsplit(X_train_pca, 2), c=y_train, s=40, cmap='coolwarm') # annotate and show the figure ax.set_xlabel('component 1') ax.set_ylabel('component 2') plt.show() from sklearn.pipeline import Pipeline # define a pipeline with some transforms and a simple classifier pipeline = Pipeline([ ('scale', StandardScaler()), ('reduce_dim', PCA()), ('clf', LogisticRegression()), ]) # enumerate all of the different settings we wish to try out parameters = { 'reduce_dim__n_components': (1, 2, 3), 'reduce_dim__whiten': (True, False), 'clf__penalty': ('l1', 'l2'), 'clf__C': (1e-3, 1e-2, 1, 1e1, 1e2, 1e3), } # grid search the parameter space grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1) print("Performing grid search...\n") print("pipeline:", [name for name, _ in pipeline.steps]) print("parameters:") print(parameters) grid_search.fit(X, y) print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters set:") best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("\t%s: %r" % (param_name, best_parameters[param_name])) # load the %R cell magic extension %load_ext rmagic # send the dataframe over to the R instance %Rpush df %%R library(ggplot2) qplot(log(time), log(cc), data=df, color=donated) %%R blood.glm <- glm(donated ~ log(cc) + log(time), data=df, family="binomial") print(summary(blood.glm)) par(mfrow=c(2, 2)) plot(blood.glm) r_coeffs = %R coef(blood.glm) r_coeffs