Exercise 2: Example for pandas using the heart.csv data set
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the heart data set from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='heart.csv')
# A bare expression on the last line of the cell makes the notebook render the table
df
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
299 | 45 | 1 | 3 | 110 | 264 | 0 | 1 | 132 | 0 | 1.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
303 rows × 14 columns
# What is the number of columns and rows?
# shape is the (rows, columns) tuple, which answers the question directly.
print(df.shape)
# Column labels
print(df.columns)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() -- that would add a stray "None" line to the output.
df.info()
# Data type of every column
print(df.dtypes)
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'], dtype='object') <class 'pandas.core.frame.DataFrame'> RangeIndex: 303 entries, 0 to 302 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 303 non-null int64 1 sex 303 non-null int64 2 cp 303 non-null int64 3 trestbps 303 non-null int64 4 chol 303 non-null int64 5 fbs 303 non-null int64 6 restecg 303 non-null int64 7 thalach 303 non-null int64 8 exang 303 non-null int64 9 oldpeak 303 non-null float64 10 slope 303 non-null int64 11 ca 303 non-null int64 12 thal 303 non-null int64 13 target 303 non-null int64 dtypes: float64(1), int64(13) memory usage: 33.3 KB None age int64 sex int64 cp int64 trestbps int64 chol int64 fbs int64 restecg int64 thalach int64 exang int64 oldpeak float64 slope int64 ca int64 thal int64 target int64 dtype: object
# Show only the first three rows of the table
print(df.head(n=3))
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \ 0 63 1 3 145 233 1 0 150 0 2.3 0 1 37 1 2 130 250 0 1 187 0 3.5 0 2 41 0 1 130 204 0 0 172 0 1.4 2 ca thal target 0 0 1 1 1 0 2 1 2 0 2 1
# Summary statistics per column: count, mean, std, min, quartiles and max
summary_stats = df.describe()
print(summary_stats)
age sex cp trestbps chol fbs \ count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 mean 54.366337 0.683168 0.966997 131.623762 246.264026 0.148515 std 9.082101 0.466011 1.032052 17.538143 51.830751 0.356198 min 29.000000 0.000000 0.000000 94.000000 126.000000 0.000000 25% 47.500000 0.000000 0.000000 120.000000 211.000000 0.000000 50% 55.000000 1.000000 1.000000 130.000000 240.000000 0.000000 75% 61.000000 1.000000 2.000000 140.000000 274.500000 0.000000 max 77.000000 1.000000 3.000000 200.000000 564.000000 1.000000 restecg thalach exang oldpeak slope ca \ count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 mean 0.528053 149.646865 0.326733 1.039604 1.399340 0.729373 std 0.525860 22.905161 0.469794 1.161075 0.616226 1.022606 min 0.000000 71.000000 0.000000 0.000000 0.000000 0.000000 25% 0.000000 133.500000 0.000000 0.000000 1.000000 0.000000 50% 1.000000 153.000000 0.000000 0.800000 1.000000 0.000000 75% 1.000000 166.000000 1.000000 1.600000 2.000000 1.000000 max 2.000000 202.000000 1.000000 6.200000 2.000000 4.000000 thal target count 303.000000 303.000000 mean 2.313531 0.544554 std 0.612277 0.498835 min 0.000000 0.000000 25% 2.000000 0.000000 50% 2.000000 1.000000 75% 3.000000 1.000000 max 3.000000 1.000000
# Pairwise correlation coefficients between all columns
corr_matrix = df.corr()
print(corr_matrix)
age sex cp trestbps chol fbs \ age 1.000000 -0.098447 -0.068653 0.279351 0.213678 0.121308 sex -0.098447 1.000000 -0.049353 -0.056769 -0.197912 0.045032 cp -0.068653 -0.049353 1.000000 0.047608 -0.076904 0.094444 trestbps 0.279351 -0.056769 0.047608 1.000000 0.123174 0.177531 chol 0.213678 -0.197912 -0.076904 0.123174 1.000000 0.013294 fbs 0.121308 0.045032 0.094444 0.177531 0.013294 1.000000 restecg -0.116211 -0.058196 0.044421 -0.114103 -0.151040 -0.084189 thalach -0.398522 -0.044020 0.295762 -0.046698 -0.009940 -0.008567 exang 0.096801 0.141664 -0.394280 0.067616 0.067023 0.025665 oldpeak 0.210013 0.096093 -0.149230 0.193216 0.053952 0.005747 slope -0.168814 -0.030711 0.119717 -0.121475 -0.004038 -0.059894 ca 0.276326 0.118261 -0.181053 0.101389 0.070511 0.137979 thal 0.068001 0.210041 -0.161736 0.062210 0.098803 -0.032019 target -0.225439 -0.280937 0.433798 -0.144931 -0.085239 -0.028046 restecg thalach exang oldpeak slope ca \ age -0.116211 -0.398522 0.096801 0.210013 -0.168814 0.276326 sex -0.058196 -0.044020 0.141664 0.096093 -0.030711 0.118261 cp 0.044421 0.295762 -0.394280 -0.149230 0.119717 -0.181053 trestbps -0.114103 -0.046698 0.067616 0.193216 -0.121475 0.101389 chol -0.151040 -0.009940 0.067023 0.053952 -0.004038 0.070511 fbs -0.084189 -0.008567 0.025665 0.005747 -0.059894 0.137979 restecg 1.000000 0.044123 -0.070733 -0.058770 0.093045 -0.072042 thalach 0.044123 1.000000 -0.378812 -0.344187 0.386784 -0.213177 exang -0.070733 -0.378812 1.000000 0.288223 -0.257748 0.115739 oldpeak -0.058770 -0.344187 0.288223 1.000000 -0.577537 0.222682 slope 0.093045 0.386784 -0.257748 -0.577537 1.000000 -0.080155 ca -0.072042 -0.213177 0.115739 0.222682 -0.080155 1.000000 thal -0.011981 -0.096439 0.206754 0.210244 -0.104764 0.151832 target 0.137230 0.421741 -0.436757 -0.430696 0.345877 -0.391724 thal target age 0.068001 -0.225439 sex 0.210041 -0.280937 cp -0.161736 0.433798 trestbps 0.062210 -0.144931 chol 0.098803 -0.085239 fbs -0.032019 -0.028046 restecg -0.011981 
0.137230 thalach -0.096439 0.421741 exang 0.206754 -0.436757 oldpeak 0.210244 -0.430696 slope -0.104764 0.345877 ca 0.151832 -0.391724 thal 1.000000 -0.344029 target -0.344029 1.000000
# Column means computed separately for each value of 'target',
# i.e. once for the rows without disease and once for the rows with disease
mean_by_target = df.groupby('target').mean()
print(mean_by_target)
age sex cp trestbps chol fbs \ target 0 56.601449 0.826087 0.478261 134.398551 251.086957 0.159420 1 52.496970 0.563636 1.375758 129.303030 242.230303 0.139394 restecg thalach exang oldpeak slope ca thal target 0 0.449275 139.101449 0.550725 1.585507 1.166667 1.166667 2.543478 1 0.593939 158.466667 0.139394 0.583030 1.593939 0.363636 2.121212
# Row selection that combines conditions on more than one column:
# keep only the rows where both 'sex' and 'target' are 0 ...
df1 = df.loc[df["sex"].eq(0) & df["target"].eq(0)]
# ... and show the first five matches
print(df1.head(n=5))
age sex cp trestbps chol fbs restecg thalach exang oldpeak \ 167 62 0 0 140 268 0 0 160 0 3.6 181 65 0 0 150 225 0 0 114 0 1.0 182 61 0 0 130 330 0 0 169 0 0.0 190 51 0 0 130 305 0 1 142 1 1.2 204 62 0 0 160 164 0 0 145 0 6.2 slope ca thal target 167 0 2 2 0 181 1 3 3 0 182 2 0 2 0 190 1 0 3 0 204 0 3 3 0
Plots
# Age distribution grouped into male and female (1 = male; 0 = female)
plt.title('age distribution according to Sex')
# Use one common set of bin edges for both groups; with separately chosen
# bins the bar heights of the two histograms would not be comparable.
age_bins = np.histogram_bin_edges(df['age'], bins=10)
# male
plt.hist(df.loc[df["sex"] == 1, 'age'], bins=age_bins, alpha=0.6, label="male")
# female -- drawn semi-transparent so it does not hide the male bars underneath
plt.hist(df.loc[df["sex"] == 0, 'age'], bins=age_bins, alpha=0.6, label="female")
plt.xlabel('age [years]')
plt.ylabel('Frequency')
# Legend entries come from the label= of each histogram, not from draw order
plt.legend()
0 63 1 37 3 56 5 57 7 44 .. 295 63 297 59 299 45 300 68 301 57 Name: age, Length: 207, dtype: int64
<matplotlib.legend.Legend at 0x12a59a1f0>
# Start a new figure so this plot does not draw into the previous one
plt.figure()
# Plot maximum heart rate
# Heart disease (0 = no, 1 = yes)
plt.title('maximum heart rate according to heart disease')
# Use one common set of bin edges for both groups; with separately chosen
# bins the bar heights of the two histograms would not be comparable.
rate_bins = np.histogram_bin_edges(df['thalach'], bins=10)
# disease
plt.hist(df.loc[df["target"] == 1, 'thalach'], bins=rate_bins, alpha=0.6, label="disease")
# no disease -- drawn semi-transparent so it does not hide the bars underneath
plt.hist(df.loc[df["target"] == 0, 'thalach'], bins=rate_bins, alpha=0.6, label="no disease")
# Legend entries come from the label= of each histogram, not from draw order
plt.legend()
plt.ylabel('Frequency')
plt.xlabel('max heart rate')
Text(0.5, 0, 'max heart rate')
# Grouped bar chart of sex versus target, built from a cross-tabulation
# (rows: sex, columns: target)
sex_by_target = pd.crosstab(df.sex, df.target)
sex_by_target.plot.bar(color=['red', 'blue'])
plt.title('Heart Disease distribution according to Sex')
plt.xlabel('Sex (0 = Female, 1 = Male)')
# Legend order follows the crosstab columns: target 0 first, then target 1
plt.legend(["no disease", "disease"])
<matplotlib.legend.Legend at 0x12addbdf0>
# Grouped bar chart of chest pain type (cp) versus target, built from a
# cross-tabulation (rows: cp, columns: target)
cp_by_target = pd.crosstab(df.cp, df.target)
cp_by_target.plot.bar(figsize=(15, 6), color=['#11A5AA', '#AA1190'])
plt.title('Heart Disease Distribution According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
# Keep the category tick labels horizontal
plt.xticks(rotation=0)
plt.ylabel('Frequency of Disease or Not')
Text(0, 0.5, 'Frequency of Disease or Not')
# Scatter plot of age against maximum heart rate, one point cloud per target value
plt.figure()
# target == 1 (disease) in red
plt.scatter(
    x=df.loc[df.target == 1, "age"],
    y=df.loc[df.target == 1, "thalach"],
    c="red",
)
# target == 0 (no disease) in the default colour
plt.scatter(
    x=df.loc[df.target == 0, "age"],
    y=df.loc[df.target == 0, "thalach"],
)
plt.title('Age-max Heart Rate Plot')
plt.xlabel('age[years]')
plt.ylabel('max. heart rate')
# Legend order matches the order in which the two scatters were drawn
plt.legend(["Disease", "No Disease"])
<matplotlib.legend.Legend at 0x12af77520>
# Scatter plot of age against cholesterol, one point cloud per target value
plt.figure()
# target == 1 (disease) in red
plt.scatter(
    x=df.loc[df.target == 1, "age"],
    y=df.loc[df.target == 1, "chol"],
    c="red",
)
# target == 0 (no disease) in the default colour
plt.scatter(
    x=df.loc[df.target == 0, "age"],
    y=df.loc[df.target == 0, "chol"],
)
plt.title('Age-Cholesterol Plot')
plt.xlabel('age[years]')
plt.ylabel('Cholesterol')
# Legend order matches the order in which the two scatters were drawn
plt.legend(["Disease", "No Disease"])
<matplotlib.legend.Legend at 0x12af0f070>