import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Heart-disease dataset from the Heidelberg ML lecture material:
# 303 patients, 13 clinical features plus a binary 'target' label
# (1 = heart disease present, per the table below).
filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/heart.csv"
# pandas reads directly from the URL; requires network access at run time.
df = pd.read_csv(filename)
# Bare expression: renders the DataFrame in a notebook cell (no-op in a plain script).
df
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
# Separate the binary outcome from the 13 clinical feature columns.
y = df['target'].values
X = df.drop(columns='target')

from sklearn.model_selection import train_test_split

# 50/50 random split; no fixed random_state, so every run reshuffles.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Four classifiers compared on the same train/test split.
# FIX: unpenalized logistic regression is requested with penalty=None —
# the string spelling 'none' was deprecated in scikit-learn 1.2 and
# removed in 1.4, so the original call errors on current versions.
lr = LogisticRegression(penalty=None, fit_intercept=True, max_iter=5000, tol=1e-5)
rf = RandomForestClassifier(max_depth=3)   # shallow trees to limit overfitting
ab = AdaBoostClassifier()
gb = GradientBoostingClassifier()
classifiers = [lr, rf, ab, gb]

# Train every model on the same training set.
for clf in classifiers:
    clf.fit(X_train, y_train)
from sklearn.metrics import roc_curve, roc_auc_score

# For each trained classifier, plot TPR (recall/sensitivity) against
# 1 - FPR (specificity) and report the ROC AUC on the test set.
for clf in classifiers:
    y_pred_prob = clf.predict_proba(X_test)  # class probabilities, column 1 = P(target=1)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob[:, 1])
    plt.plot(tpr, 1 - fpr, label=clf.__class__.__name__)
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])
    print(clf.__class__.__name__, f'AUC = {auc}')

# FIX: the y-axis quantity is 1 - FPR = specificity, not precision;
# the original 'Precision' label was incorrect.
plt.xlabel('Recall (TPR)', fontsize=18)
plt.ylabel('Specificity (1 - FPR)', fontsize=18)
plt.legend(fontsize=15)
LogisticRegression AUC = 0.8872549019607843 RandomForestClassifier AUC = 0.9033613445378151 AdaBoostClassifier AUC = 0.8300070028011205 GradientBoostingClassifier AUC = 0.8940826330532212
<matplotlib.legend.Legend at 0x7fdede7e1d68>
# One list of test-set AUCs per classifier, keyed by class name.
aucs = {clf.__class__.__name__: [] for clf in classifiers}

# Reshuffle the data 1000 times; refit each model on the new training
# half and record its AUC on the new test half.
for _ in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
    for clf in classifiers:
        clf.fit(X_train, y_train)
        scores = clf.predict_proba(X_test)[:, 1]
        aucs[clf.__class__.__name__].append(roc_auc_score(y_test, scores))
# Overlay the AUC distributions from the reshuffling study,
# one step-style histogram per classifier.
for clf in classifiers:
    name = clf.__class__.__name__
    plt.hist(aucs[name], bins=40, range=(0.7, 1),
             histtype='step', linewidth=2, label=name)
plt.legend(loc='upper left')