import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC, SVC
import classification_helper
%matplotlib inline
# Pick the white seaborn-like style sheet under whichever name this matplotlib
# installation lists: the legacy 'seaborn-white' name was renamed to
# 'seaborn-v0_8-white' in matplotlib 3.6 and is rejected by later releases.
plt.style.use(
    'seaborn-white' if 'seaborn-white' in plt.style.available
    else 'seaborn-v0_8-white'
)
# Load the Heart data set (first CSV column is the row index) and discard
# every row that has a missing value.
df_heart = pd.read_csv('Data/Heart.csv', index_col=0).dropna()

# Convert the qualitative columns to pandas categoricals and report the
# levels found in each one.
for cat_col in ('ChestPain', 'Thal', 'AHD'):
    as_category = df_heart[cat_col].astype('category')
    df_heart[cat_col] = as_category
    print(f'{cat_col}: {as_category.cat.categories.values}')

# Notebook display: first three rows of the cleaned frame.
df_heart.head(3)
ChestPain: ['asymptomatic' 'nonanginal' 'nontypical' 'typical'] Thal: ['fixed' 'normal' 'reversable'] AHD: ['No' 'Yes']
Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
# Predictors: every column except the response, with the categorical columns
# one-hot encoded. drop_first=True drops one level per categorical so the
# dummy columns are not perfectly collinear.
X = pd.get_dummies(df_heart.drop(['AHD'], axis=1), drop_first=True)

# Response: AHD encoded as 1 ('Yes') / 0 ('No').
# AHD is a categorical column, and mapping a categorical can itself return a
# categorical Series (depending on the pandas version) rather than plain
# integers; the explicit cast guarantees an integer label vector for the
# estimators and metrics used downstream.
y = df_heart.AHD.map({'Yes': 1, 'No': 0}).astype(int)

# Notebook display: first three rows of the design matrix.
X.head(3)
Age | Sex | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | ChestPain_nonanginal | ChestPain_nontypical | ChestPain_typical | Thal_normal | Thal_reversable | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 63 | 1 | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | 0 | 0 | 1 | 0 | 0 |
2 | 67 | 1 | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | 0 | 0 | 0 | 1 | 0 |
3 | 67 | 1 | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | 0 | 0 | 0 | 0 | 1 |
# Notebook display: preview the first entries of the response vector y.
y.head()
1 0 2 1 3 1 4 0 5 0 Name: AHD, dtype: int64
# Hold out 90 observations as the test set; the rest is used for training.
# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=90)

# LDA: fit on the training split, then score the test split with the
# predicted probability in column 1 of predict_proba.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred_LDA = lda.predict_proba(X_test)[:, 1]

# Keep only (fpr, tpr) from roc_curve; the thresholds are discarded.
fpr_LDA, tpr_LDA, _ = roc_curve(y_test, y_pred_LDA)
roc_LDA = [fpr_LDA, tpr_LDA]
roc_auc_LDA = auc(fpr_LDA, tpr_LDA)
# Linear support vector classifier on standardized features.
svc = make_pipeline(StandardScaler(), LinearSVC())
svc.fit(X_train, y_train)

# The signed decision-function values are used as the ROC scores.
y_pred_SVC = svc.decision_function(X_test)
fpr_SVC, tpr_SVC, _ = roc_curve(y_test, y_pred_SVC)
roc_SVC = [fpr_SVC, tpr_SVC]
roc_auc_SVC = auc(fpr_SVC, tpr_SVC)
# RBF-kernel SVMs on standardized features, one per gamma value.
# For each fit the test-set ROC curve and its AUC are collected in the same
# order as `gammas`.
gammas = [1e-3, 1e-2, 1e-1]
roc_SVMs, roc_auc_SVMs = [], []
for gamma in gammas:
    svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma=gamma))
    svm.fit(X_train, y_train)
    y_pred_SVM = svm.decision_function(X_test)

    # Keep (fpr, tpr); thresholds are not needed.
    fpr_SVM, tpr_SVM, _ = roc_curve(y_test, y_pred_SVM)
    roc_SVM = [fpr_SVM, tpr_SVM]
    roc_auc_SVM = auc(fpr_SVM, tpr_SVM)

    roc_SVMs.append(roc_SVM)
    roc_auc_SVMs.append(roc_auc_SVM)
# Two side-by-side ROC panels for the Heart test set.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Diagonal reference line (a classifier that guesses at random).
chance = np.linspace(0, 1.0, 100)

# Left panel: LDA vs linear SVC
ax1.plot(chance, chance, color='black', linestyle='--', lw=1, label='Random')
ax1.plot(*roc_LDA, color='blue', lw=1, label=f'LDA (area = {roc_auc_LDA:0.2f})')
ax1.plot(*roc_SVC, color='red', lw=1, label=f'SVC (area = {roc_auc_SVC:0.2f})')
ax1.legend()

# Right panel: linear SVC vs the RBF SVMs, one curve per gamma
ax2.plot(chance, chance, color='black', linestyle='--', lw=1, label='Random')
ax2.plot(*roc_SVC, color='red', lw=1, label=f'SVC (area = {roc_auc_SVC:0.2f})')
svm_colors = ['black', 'green', 'blue']
for color, g, curve, area in zip(svm_colors, gammas, roc_SVMs, roc_auc_SVMs):
    ax2.plot(*curve, color=color, lw=1, label=f'SVM: g={g:1.0e} (area = {area:0.2f})')
ax2.legend()

# Same axis labels on both panels.
for ax in (ax1, ax2):
    ax.set_ylabel('True positive rate')
    ax.set_xlabel('False positive rate')
# Simulate 20 two-dimensional standard-normal points; the first ten are
# labelled -1 and the last ten +1.
X = np.random.normal(size=(20, 2))
y = np.repeat([-1.0, 1.0], 10)

# Offset the +1 class by one unit in both coordinates. The classes still
# overlap, so they are generally not linearly separable.
X[y == 1] += 1
# Linear-kernel support vector classifier with a large cost (C=100), i.e. a
# heavy penalty on margin violations. The fit call is the last expression so
# the notebook echoes the fitted estimator.
svc = SVC(C=100, kernel='linear')
svc.fit(X, y)
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# Draw the fitted classifier with the project helper; labels are rescaled
# from {-1, +1} to {0, 1} before being passed in.
ax = classification_helper.plot_classification(svc, X, (y + 1) / 2)

# Overlay the support vectors as crosses, blue for class -1 and red otherwise.
support_vecs = svc.support_vectors_
sv_colors = np.where(y[svc.support_] == -1, 'b', 'r').tolist()
ax.scatter(support_vecs[:, 0], support_vecs[:, 1],
           marker='x', c=sv_colors, label='Support vectors')

# Show only the most recently added legend entry (the support vectors).
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[-1:], labels[-1:])
plt.show()

# Notebook display: number of support vectors.
len(support_vecs)
15
# Base estimator and the candidate cost values: 50 points spaced
# logarithmically between 1e-2 and 1e1.
clf = SVC(kernel='linear')
p_grid = {'C': np.logspace(-2, 1, 50)}

# 10-fold cross-validated search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=p_grid,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Notebook display: the selected hyper-parameters.
grid_search.best_params_
{'C': 0.01}
# Notebook display: cross-validated score of the selected parameter setting.
grid_search.best_score_
0.7
# Draw the tuned classifier with the project helper; labels are rescaled
# from {-1, +1} to {0, 1} before being passed in.
ax = classification_helper.plot_classification(grid_search, X, (y+1)/2)

# Overlay the support vectors of the refit best estimator as crosses,
# blue for class -1 and red otherwise.
best_svc = grid_search.best_estimator_
support_vecs = best_svc.support_vectors_
ax.scatter(support_vecs[:, 0], support_vecs[:, 1], marker='x',
           c=['b' if val == -1 else 'r' for val in y[best_svc.support_]],
           label='Support vectors')

# Legend with only the most recently added entry (the support vectors), as in
# the other support-vector plots of this notebook. Previously the handles
# were fetched but never used and the scatter carried no label.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[-1:], labels[-1:])

# Classification report / confusion matrix computed on the same data the
# search was fitted on.
classification_helper.print_classification_statistics(grid_search, X, y)
Classification Report: precision recall f1-score support -1.0 0.667 0.800 0.727 10 1.0 0.750 0.600 0.667 10 avg / total 0.708 0.700 0.697 20 Confusion Matrix: Predicted 0 1 Real 0 0.8 0.2 1 0.4 0.6
# Move the +1 class one more unit along both coordinates. The shift is applied
# in place, so it adds to any offset X already carries.
# NOTE(review): the original comment called this the "non linearly separable
# case", but an extra shift pulls the classes further apart - confirm intent.
X[y == 1] += 1

# Refit a linear SVC with a small cost (C=0.1).
svc = SVC(C=0.1, kernel='linear')
svc.fit(X, y)

# Draw the classifier; labels are rescaled from {-1, +1} to {0, 1}.
ax = classification_helper.plot_classification(svc, X, (y + 1) / 2)

# Overlay the support vectors as crosses, blue for class -1 and red otherwise.
support_vecs = svc.support_vectors_
sv_colors = np.where(y[svc.support_] == -1, 'b', 'r').tolist()
ax.scatter(support_vecs[:, 0], support_vecs[:, 1],
           marker='x', c=sv_colors, label='Support vectors')

# Show only the most recently added legend entry (the support vectors).
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[-1:], labels[-1:])
plt.show()

# Classification report / confusion matrix on the data the model was fitted on.
classification_helper.print_classification_statistics(svc, X, y)
Classification Report: precision recall f1-score support -1.0 0.727 0.800 0.762 10 1.0 0.778 0.700 0.737 10 avg / total 0.753 0.750 0.749 20 Confusion Matrix: Predicted 0 1 Real 0 0.8 0.2 1 0.3 0.7
# Simulate 200 two-dimensional points with a non-linear class boundary:
#   rows   0-99  : standard normal shifted by +2  -> class 1
#   rows 100-149 : standard normal shifted by -2  -> class 1
#   rows 150-199 : standard normal, no shift      -> class 0
group_shift = np.repeat([2.0, -2.0, 0.0], [100, 50, 50])[:, np.newaxis]
X = np.random.normal(size=(200, 2)) + group_shift
y = np.repeat([1.0, 0.0], [150, 50])
# Scatter the simulated points: class 1 in red, everything else in blue.
# The trailing semicolon suppresses the notebook echo of the return value.
point_colors = np.where(y == 1, 'r', 'b').tolist()
plt.scatter(X[:, 0], X[:, 1], c=point_colors);
# Hold out 100 points for testing (random split, no fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100)

# RBF SVM on standardized features; make_pipeline names the SVC step 'svc',
# which is why the grid keys carry the 'svc__' prefix.
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
p_grid = {
    'svc__C': np.logspace(-3, 0.1, 50),
    'svc__gamma': np.linspace(0.1, 10, 20),
}

# 10-fold cross-validated search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=p_grid,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Notebook display: the selected hyper-parameters.
grid_search.best_params_
{'svc__C': 0.3925342450813755, 'svc__gamma': 1.142105263157895}
# Evaluate the complete tuned pipeline (StandardScaler + SVC), not just its
# final 'svc' step: the SVC was trained on standardized features, so feeding
# it the raw X_test would score points on a different scale than it was
# fitted on and distort every prediction below.
best_svm = grid_search.best_estimator_

# Decision regions and classification report on the held-out test split.
classification_helper.plot_classification(best_svm, X_test, y_test)
classification_helper.print_classification_statistics(best_svm, X_test, y_test)

# Test-set misclassification rate, in percent.
y_pred = best_svm.predict(X_test)
print('% of miscassified samples: ', 100-accuracy_score(y_test, y_pred)*100, '%')
Classification Report: precision recall f1-score support 0.0 1.000 0.348 0.516 23 1.0 0.837 1.000 0.911 77 avg / total 0.874 0.850 0.820 100 Confusion Matrix: Predicted 0 1 Real 0 0.347826 0.652174 1 0.000000 1.000000 % of miscassified samples: 15.0 %
# Score the test split with the full tuned pipeline so the features are
# standardized exactly as they were during training. Calling the bare SVC
# step on raw X_test would bypass the scaler and give misleading scores.
y_pred_SVM = grid_search.best_estimator_.decision_function(X_test)

# Keep (fpr, tpr) from roc_curve; the thresholds are discarded.
*roc_SVM, _ = roc_curve(y_test, y_pred_SVM)
roc_auc_SVM = auc(*roc_SVM)

fig, ax = plt.subplots(1, 1, figsize=(5,4))
# Tuned RBF SVM vs the random-guess diagonal
ax.plot(np.linspace(0, 1.0, 100), np.linspace(0, 1.0, 100), color='black', linestyle='--', lw=1, label='Random')
ax.plot(*roc_SVM, color='red', lw=1, label=f'SVM (area = {roc_auc_SVM:0.2f})')
ax.set_ylabel('True positive rate')
ax.set_xlabel('False positive rate')
ax.legend();
# Append a third class (label 2): 50 normal points centred at (0, 2),
# stacked below the existing observations.
third_class = np.random.normal(loc=(0, 2), size=(50, 2))
X = np.vstack([X, third_class])
y = np.append(y, np.full(50, 2.0))

# Scatter by class: 1 -> grey, 0 -> blue, anything else (class 2) -> red.
# The trailing semicolon suppresses the notebook echo of the return value.
class_colors = {1: 'grey', 0: 'b'}
plt.scatter(X[:, 0], X[:, 1], c=[class_colors.get(val, 'r') for val in y]);
# Three-class RBF SVM on standardized features, with the decision function
# in one-vs-one shape.
rbf_ovo = SVC(kernel='rbf', decision_function_shape='ovo')
svm = make_pipeline(StandardScaler(), rbf_ovo)
svm.fit(X, y)

# Decision regions and classification report. Both are computed on the same
# data the model was fitted on, so they describe training performance.
classification_helper.plot_classification(svm, X, y)
classification_helper.print_classification_statistics(svm, X, y)

# Training misclassification rate, in percent.
y_pred = svm.predict(X)
misclassified_pct = 100 - accuracy_score(y, y_pred) * 100
print(f'% of miscassified samples: {misclassified_pct:.2f}%')
Classification Report: precision recall f1-score support 0.0 0.796 0.780 0.788 50 1.0 0.899 0.953 0.926 150 2.0 0.738 0.620 0.674 50 avg / total 0.846 0.852 0.848 250 Confusion Matrix: Predicted 0 1 2 Real 0 0.78 0.080000 0.140000 1 0.02 0.953333 0.026667 2 0.14 0.240000 0.620000 % of miscassified samples: 14.80%
# Load the four Khan CSV files (first column is the row index) and drop rows
# with missing values from each one independently.
# NOTE(review): because rows are dropped per file, a missing value in only
# one of a matching x/y pair would leave them misaligned - presumably the
# files contain no missing values; confirm.
khan_files = [
    'Data/Khan_xtrain.csv',
    'Data/Khan_xtest.csv',
    'Data/Khan_ytrain.csv',
    'Data/Khan_ytest.csv',
]
df_khan_xtrain, df_khan_xtest, df_khan_ytrain, df_khan_ytest = [
    pd.read_csv(path, index_col=0).dropna() for path in khan_files
]
# Linear SVC with cost C=0.1 on the Khan training data. The label frame is
# flattened to a 1-D array before fitting. The fit call is the last
# expression so the notebook echoes the fitted estimator.
khan_labels = np.ravel(df_khan_ytrain)
svc = SVC(C=0.1, kernel='linear')
svc.fit(df_khan_xtrain, khan_labels)
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
# Notebook display: number of support vectors of the fitted classifier.
len(svc.support_)
54
# Report the classifier's performance on the Khan training set (the data it was fitted on).
classification_helper.print_classification_statistics(svc, df_khan_xtrain, df_khan_ytrain)
Classification Report: precision recall f1-score support 1 1.000 1.000 1.000 8 2 1.000 1.000 1.000 23 3 1.000 1.000 1.000 12 4 1.000 1.000 1.000 20 avg / total 1.000 1.000 1.000 63 Confusion Matrix: Predicted 0 1 2 3 Real 0 1.0 0.0 0.0 0.0 1 0.0 1.0 0.0 0.0 2 0.0 0.0 1.0 0.0 3 0.0 0.0 0.0 1.0
# Report the classifier's performance on the held-out Khan test set.
classification_helper.print_classification_statistics(svc, df_khan_xtest, df_khan_ytest)
Classification Report: precision recall f1-score support 1 1.000 1.000 1.000 3 2 0.750 1.000 0.857 6 3 1.000 0.667 0.800 6 4 1.000 1.000 1.000 5 avg / total 0.925 0.900 0.897 20 Confusion Matrix: Predicted 0 1 2 3 Real 0 1.0 0.000000 0.000000 0.0 1 0.0 1.000000 0.000000 0.0 2 0.0 0.333333 0.666667 0.0 3 0.0 0.000000 0.000000 1.0