%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore', DeprecationWarning)

#!curl -s https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv | head -5
!head -5 titanic_train.csv

#data = pd.read_csv('https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv')
data = pd.read_csv('titanic_train.csv')

data.head(5)

data.count()

list(data.columns)

data.shape

data.values

survived_column = data['Survived']
survived_column.dtype

type(survived_column)

type(data)

data.groupby('Survived').count()

np.mean(survived_column == 0)

target = survived_column.values
type(target)

target.dtype

target[:5]

numerical_features = data.get(['Fare', 'Pclass', 'Age'])
numerical_features.head(5)

numerical_features.count()

median_features = numerical_features.dropna().median()
median_features

imputed_features = numerical_features.fillna(median_features)
imputed_features.count()

imputed_features.head(5)

features_array = imputed_features.values
features_array

features_array.dtype

from sklearn.cross_validation import train_test_split

features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=0)

features_train.shape

features_test.shape

target_train.shape

target_test.shape

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)

target_predicted = logreg.predict(features_test)

from sklearn.metrics import accuracy_score

accuracy_score(target_test, target_predicted)

logreg.score(features_test, target_test)
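# Added sketch, not part of the original notebook: a majority-class baseline
# via sklearn's DummyClassifier, as a sanity check that the logistic
# regression model actually beats always predicting the most frequent class
# ("not survived", roughly 62% of passengers as computed above).
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(features_train, target_train)
dummy.score(features_test, target_test)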
feature_names = numerical_features.columns
feature_names

logreg.coef_

x = np.arange(len(feature_names))
plt.bar(x, logreg.coef_.ravel())
_ = plt.xticks(x + 0.5, feature_names, rotation=30)

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(target_test, target_predicted)
print(cm)

def plot_confusion(cm):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title('Confusion matrix')
    plt.set_cmap('Blues')
    plt.colorbar()
    target_names = ['not survived', 'survived']
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=60)
    plt.yticks(tick_marks, target_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Convenience function to adjust plot parameters for a clear layout.
    plt.tight_layout()

plot_confusion(cm)

print(cm)

# Normalize each row by its sum to get per-class rates; the [:, np.newaxis]
# is needed so that rows (true classes), not columns, are divided by the
# row sums.
print(cm.astype(np.float64) / cm.sum(axis=1)[:, np.newaxis])

from sklearn.metrics import classification_report

print(classification_report(target_test, target_predicted,
                            target_names=['not survived', 'survived']))

target_predicted_proba = logreg.predict_proba(features_test)
target_predicted_proba[:5]

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test,
                                     target_predicted_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")

plot_roc_curve(target_test, target_predicted_proba)

features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=0)
logreg.fit(features_train, target_train).score(features_test, target_test)

features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=1)
logreg.fit(features_train, target_train).score(features_test, target_test)

features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=2)
logreg.fit(features_train, target_train).score(features_test, target_test)

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(logreg, features_array, target, cv=5)
scores

scores.min(), scores.mean(), scores.max()

scores = cross_val_score(logreg, features_array, target, cv=5,
                         scoring='roc_auc')
scores.min(), scores.mean(), scores.max()
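# Added sketch, not part of the original notebook: what cross_val_score with
# cv=5 does under the hood for a classifier, as an explicit loop over
# stratified folds. Assumes the same sklearn.cross_validation module used
# throughout this notebook.
from sklearn.cross_validation import StratifiedKFold

manual_scores = []
for train_idx, test_idx in StratifiedKFold(target, n_folds=5):
    logreg.fit(features_array[train_idx], target[train_idx])
    manual_scores.append(
        logreg.score(features_array[test_idx], target[test_idx]))
np.mean(manual_scores)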
pd.get_dummies(data.Sex, prefix='Sex').head(5)

pd.get_dummies(data.Embarked, prefix='Embarked').head(5)

rich_features = pd.concat([data.get(['Fare', 'Pclass', 'Age']),
                           pd.get_dummies(data.Sex, prefix='Sex'),
                           pd.get_dummies(data.Embarked, prefix='Embarked')],
                          axis=1)
rich_features.head(5)

# Drop one of the two redundant sex dummy columns: Sex_female alone carries
# the same information.
rich_features_no_male = rich_features.drop('Sex_male', axis=1)
rich_features_no_male.head(5)

rich_features_final = rich_features_no_male.fillna(
    rich_features_no_male.dropna().median())
rich_features_final.head(5)

%%time
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

logreg = LogisticRegression(C=1)
scores = cross_val_score(logreg, rich_features_final, target, cv=5,
                         scoring='accuracy')
print("Logistic Regression CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

%load solutions/04A_plot_logistic_regression_weights.py

%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, rich_features_final, target, cv=5, n_jobs=4,
                         scoring='accuracy')
print("Random Forest CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

%%time
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                subsample=.8, max_features=.5)
scores = cross_val_score(gb, rich_features_final, target, cv=5, n_jobs=4,
                         scoring='accuracy')
print("Gradient Boosted Trees CV scores:")
print("min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
    scores.min(), scores.mean(), scores.max()))

%load solutions/04B_more_categorical_variables.py

%load solutions/04C_feature_importance.py

%%time
from sklearn.grid_search import GridSearchCV

gb = GradientBoostingClassifier(n_estimators=100, subsample=.8)

params = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5],
}
gs = GridSearchCV(gb, params, cv=5, scoring='roc_auc', n_jobs=4)
gs.fit(rich_features_final, target)

sorted(gs.grid_scores_, key=lambda x: x.mean_validation_score, reverse=True)

gs.best_score_

gs.best_params_

features = pd.concat([data.get(['Fare', 'Age']),
                      pd.get_dummies(data.Sex, prefix='Sex'),
                      pd.get_dummies(data.Pclass, prefix='Pclass'),
                      pd.get_dummies(data.Embarked, prefix='Embarked')],
                     axis=1)
features = features.drop('Sex_male', axis=1)

# Because of the following bug we cannot use NaN as the missing value
# marker; use a negative value as the marker instead:
# https://github.com/scikit-learn/scikit-learn/issues/3044
features = features.fillna(-1)
features.head(5)

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features.values, target, random_state=0)

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='median', missing_values=-1)
imputer.fit(X_train)

imputer.statistics_

X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

np.any(X_train == -1)

np.any(X_train_imputed == -1)

np.any(X_test == -1)

np.any(X_test_imputed == -1)

from sklearn.pipeline import Pipeline

imputer = Imputer(strategy='median', missing_values=-1)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                        subsample=.8, max_features=.5)

pipeline = Pipeline([
    ('imp', imputer),
    ('clf', classifier),
])

scores = cross_val_score(pipeline, features.values, target, cv=5, n_jobs=4,
                         scoring='accuracy')
print(scores.min(), scores.mean(), scores.max())

%%time
params = {
    'imp__strategy': ['mean', 'median'],
    'clf__max_features': [0.5, 1],
    'clf__max_depth': [3, 4, 5],
}
gs = GridSearchCV(pipeline, params, cv=5, scoring='roc_auc', n_jobs=4)
gs.fit(X_train, y_train)

sorted(gs.grid_scores_, key=lambda x: x.mean_validation_score, reverse=True)

gs.best_score_

plot_roc_curve(y_test, gs.predict_proba(X_test))

gs.best_params_
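# Added sketch, not part of the original notebook: GridSearchCV refits the
# best parameter combination on the data passed to fit (refit=True by
# default), so the tuned pipeline in best_estimator_ can be evaluated
# directly on the held-out split defined above.
gs.best_estimator_.score(X_test, y_test)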