import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Heart-disease dataset from the Heidelberg ML lecture material:
# 303 patients, 13 clinical features plus a binary 'target' label
# (1 = heart disease present, per the table below).
filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/heart.csv"
# pandas reads directly from the URL; requires network access at run time.
df = pd.read_csv(filename)
# Bare expression: renders the DataFrame in a notebook cell (no-op in a plain script).
df
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
# Separate the binary outcome from the 13 clinical feature columns.
y = df['target'].values
X = df.drop(columns='target')

from sklearn.model_selection import train_test_split

# 50/50 random split; no fixed random_state, so every run reshuffles.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Four classifiers compared on the same train/test split.
# FIX: unpenalized logistic regression is requested with penalty=None —
# the string spelling 'none' was deprecated in scikit-learn 1.2 and
# removed in 1.4, so the original call errors on current versions.
lr = LogisticRegression(penalty=None, fit_intercept=True, max_iter=5000, tol=1e-5)
rf = RandomForestClassifier(max_depth=3)   # shallow trees to limit overfitting
ab = AdaBoostClassifier()
gb = GradientBoostingClassifier()
classifiers = [lr, rf, ab, gb]

# Train every model on the same training set.
for clf in classifiers:
    clf.fit(X_train, y_train)
from sklearn.metrics import roc_curve, roc_auc_score

# For each trained classifier, plot TPR (recall/sensitivity) against
# 1 - FPR (specificity) and report the ROC AUC on the test set.
for clf in classifiers:
    y_pred_prob = clf.predict_proba(X_test)  # class probabilities, column 1 = P(target=1)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob[:, 1])
    plt.plot(tpr, 1 - fpr, label=clf.__class__.__name__)
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])
    print(clf.__class__.__name__, f'AUC = {auc}')

# FIX: the y-axis quantity is 1 - FPR = specificity, not precision;
# the original 'Precision' label was incorrect.
plt.xlabel('Recall (TPR)', fontsize=18)
plt.ylabel('Specificity (1 - FPR)', fontsize=18)
plt.legend(fontsize=15)
LogisticRegression AUC = 0.8872549019607843 RandomForestClassifier AUC = 0.9033613445378151 AdaBoostClassifier AUC = 0.8300070028011205 GradientBoostingClassifier AUC = 0.8940826330532212
<matplotlib.legend.Legend at 0x7fdede7e1d68>
# One list of test-set AUCs per classifier, keyed by class name.
aucs = {clf.__class__.__name__: [] for clf in classifiers}

# Reshuffle the data 1000 times; refit each model on the new training
# half and record its AUC on the new test half.
for _ in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
    for clf in classifiers:
        clf.fit(X_train, y_train)
        scores = clf.predict_proba(X_test)[:, 1]
        aucs[clf.__class__.__name__].append(roc_auc_score(y_test, scores))
# Overlay the AUC distributions from the reshuffling study,
# one step-style histogram per classifier.
for clf in classifiers:
    name = clf.__class__.__name__
    plt.hist(aucs[name], bins=40, range=(0.7, 1),
             histtype='step', linewidth=2, label=name)
plt.legend(loc='upper left')