HEART FAILURE PREDICTION
In this notebook we evaluate several variables to determine how they may relate to whether a patient dies or survives a heart failure event.
The variables that are included in this data set are:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
# Plotly Packages
from plotly import tools
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the '/kaggle/input' tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv
# Read the clinical-records CSV into a DataFrame and preview the first rows.
data_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(data_path)
df.head()
age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
# Display the column labels of the loaded frame.
df.columns
Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time', 'DEATH_EVENT'], dtype='object')
# Rename the 'anaemia' column to the spelling 'anemia' used in the rest of the notebook.
df = df.rename(columns={'anaemia': 'anemia'})

# Print the dtype and non-null count of every column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 299 entries, 0 to 298 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 299 non-null float64 1 anemia 299 non-null int64 2 creatinine_phosphokinase 299 non-null int64 3 diabetes 299 non-null int64 4 ejection_fraction 299 non-null int64 5 high_blood_pressure 299 non-null int64 6 platelets 299 non-null float64 7 serum_creatinine 299 non-null float64 8 serum_sodium 299 non-null int64 9 sex 299 non-null int64 10 smoking 299 non-null int64 11 time 299 non-null int64 12 DEATH_EVENT 299 non-null int64 dtypes: float64(3), int64(10) memory usage: 30.5 KB
# Display summary statistics (count, mean, std, min, quartiles, max) for the columns.
df.describe()
age | anemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 299.000000 | 299.000000 | 299.000000 | 299.000000 | 299.000000 | 299.000000 | 299.000000 | 299.00000 | 299.000000 | 299.000000 | 299.00000 | 299.000000 | 299.00000 |
mean | 60.833893 | 0.431438 | 581.839465 | 0.418060 | 38.083612 | 0.351171 | 263358.029264 | 1.39388 | 136.625418 | 0.648829 | 0.32107 | 130.260870 | 0.32107 |
std | 11.894809 | 0.496107 | 970.287881 | 0.494067 | 11.834841 | 0.478136 | 97804.236869 | 1.03451 | 4.412477 | 0.478136 | 0.46767 | 77.614208 | 0.46767 |
min | 40.000000 | 0.000000 | 23.000000 | 0.000000 | 14.000000 | 0.000000 | 25100.000000 | 0.50000 | 113.000000 | 0.000000 | 0.00000 | 4.000000 | 0.00000 |
25% | 51.000000 | 0.000000 | 116.500000 | 0.000000 | 30.000000 | 0.000000 | 212500.000000 | 0.90000 | 134.000000 | 0.000000 | 0.00000 | 73.000000 | 0.00000 |
50% | 60.000000 | 0.000000 | 250.000000 | 0.000000 | 38.000000 | 0.000000 | 262000.000000 | 1.10000 | 137.000000 | 1.000000 | 0.00000 | 115.000000 | 0.00000 |
75% | 70.000000 | 1.000000 | 582.000000 | 1.000000 | 45.000000 | 1.000000 | 303500.000000 | 1.40000 | 140.000000 | 1.000000 | 1.00000 | 203.000000 | 1.00000 |
max | 95.000000 | 1.000000 | 7861.000000 | 1.000000 | 80.000000 | 1.000000 | 850000.000000 | 9.40000 | 148.000000 | 1.000000 | 1.00000 | 285.000000 | 1.00000 |
There are no missing values
# Percentage of missing (NaN) entries per column: NaN count * 100 / number of rows.
row_total = df.shape[0]
missing_percentage = df.isna().sum().mul(100).div(row_total)
missing_percentage
age 0.0 anemia 0.0 creatinine_phosphokinase 0.0 diabetes 0.0 ejection_fraction 0.0 high_blood_pressure 0.0 platelets 0.0 serum_creatinine 0.0 serum_sodium 0.0 sex 0.0 smoking 0.0 time 0.0 DEATH_EVENT 0.0 dtype: float64
# Split the records by outcome flag: DEATH_EVENT == 0 vs. DEATH_EVENT == 1.
# NOTE: these frames are snapshots of df as it is at this point; rows that later
# cells drop from df in place are not removed from them.
df_survived = df[df['DEATH_EVENT'] == 0]
df_died = df[df['DEATH_EVENT'] == 1]

# Column groups: binary (0/1) features and continuous measurements.
binary_features = ['anemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction',
                       'platelets', 'serum_creatinine', 'serum_sodium', 'time']
df_cat = df[binary_features]
df_cont = df[continuous_features]
# Print the value counts of every binary column, each under its own heading
# and followed by a blank line.
binary_headings = [
    ("ANEMIA:", 'anemia'),
    ("DIABETES:", 'diabetes'),
    ("HIGH BLOOD PRESSURE:", 'high_blood_pressure'),
    ("SEX:", 'sex'),
    ("SMOKING:", 'smoking'),
    ("DEATH EVENT:", 'DEATH_EVENT'),
]
for heading, column in binary_headings:
    print(heading)
    print(df[column].value_counts())
    print("")
ANEMIA: 0 170 1 129 Name: anemia, dtype: int64 DIABETES: 0 174 1 125 Name: diabetes, dtype: int64 HIGH BLOOD PRESSURE: 0 194 1 105 Name: high_blood_pressure, dtype: int64 SEX: 1 194 0 105 Name: sex, dtype: int64 SMOKING: 0 203 1 96 Name: smoking, dtype: int64 DEATH EVENT: 0 203 1 96 Name: DEATH_EVENT, dtype: int64
# Print the minimum and maximum of selected continuous columns,
# separated (but not followed) by blank lines.
range_columns = [
    ("Age", 'age'),
    ("Creatinine Phosphokinase", 'creatinine_phosphokinase'),
    ("Platelets", 'platelets'),
    ("Serum Creatinine", 'serum_creatinine'),
    ("Serum Sodium", 'serum_sodium'),
    ("Time", 'time'),
]
for position, (display_name, column) in enumerate(range_columns):
    if position > 0:
        print("")
    print("Range of " + display_name + " Column: ", df[column].min(), "to", df[column].max())
Range of Age Column: 40.0 to 95.0 Range of Creatinine Phosphokinase Column: 23 to 7861 Range of Platelets Column: 25100.0 to 850000.0 Range of Serum Creatinine Column: 0.5 to 9.4 Range of Serum Sodium Column: 113 to 148 Range of Time Column: 4 to 285
We'll use box plots to visualize the major outliers and then delete the rows that contain them. Removing these extreme values should help the accuracy of our predictive models.
# Draw one horizontal box plot per column to make extreme values visible.
box_plot_columns = [
    ("creatinine_phosphokinase", 'CREATININE PHOSPHOKINASE'),
    ("platelets", 'PLATELETS'),
    ("serum_creatinine", 'SERUM CREATININE'),
    ("serum_sodium", 'SERUM SODIUM'),
]
for column, title in box_plot_columns:
    fig = px.box(df, x=column)
    fig.update_layout(title_text=title)
    fig.show()
# DROP ROWS WITH OUTLIER VALUES
# A row is removed if it crosses any of the thresholds below; every condition
# looks only at the row's own values, so one combined mask selects the same rows
# as dropping threshold by threshold.
is_outlier = (
    (df['creatinine_phosphokinase'] >= 1380)
    | (df['platelets'] >= 448000)
    | (df['platelets'] <= 73000)
    | (df['serum_creatinine'] >= 1.7)
    | (df['serum_sodium'] <= 127)
    | (df['serum_sodium'] >= 148)
)
df.drop(df[is_outlier].index, inplace=True)
In this section we'll explore our data and create some visualizations to give us further insight.
# Count the records per outcome in a fixed order (0 = survived, 1 = died).
# value_counts() orders by frequency, so without the reindex the hard-coded
# labels below would be attached to the wrong counts whenever deaths
# outnumber survivors.
values = df['DEATH_EVENT'].value_counts().reindex([0, 1], fill_value=0)
outcome_labels = ['Survived', 'Died']
outcome_colors = ['#2ad4cb', '#e6c822']

# Left panel: bar chart of the counts; right panel: donut chart of the shares.
fig = make_subplots(rows=1, cols=2,
                    specs=[[{"type": "xy"}, {"type": "domain"}]],
                    subplot_titles=('Death Event Count', 'Death Event Percentage'))

# Pass the labels as x so the bars are named like the pie slices instead of
# being placed at the bare positions 0 and 1.
fig.add_trace(go.Bar(x=outcome_labels,
                     y=values,
                     name='Death Event Count',
                     marker=dict(color=outcome_colors)), row=1, col=1)

fig.add_trace(go.Pie(labels=outcome_labels,
                     values=values,
                     name='Death Event Percentage',
                     hole=0.5,
                     marker=dict(colors=outcome_colors)), row=1, col=2)

fig.update_layout(height=500,
                  title_text='DEATH EVENT STATS',
                  showlegend=True)
fig.show()
The dataset is unbalanced: patients who died make up a clear minority of the records.
def plot_cats(feat):
    """Show the value counts of one binary column as a bar chart and a donut chart.

    Parameters
    ----------
    feat : str
        Name of a column of the notebook-level ``df`` (for example ``'anemia'``).
        It is also used, title-cased / upper-cased, in the chart titles.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Read from df rather than the df_cat snapshot so the counts reflect the
    # rows currently in df, matching the other charts built from df.
    # sort_index() fixes the order (0 before 1) so each value always gets the
    # same colour regardless of which one is more frequent.
    counts = df[feat].value_counts().sort_index()
    labels = [str(value) for value in counts.index]
    colors = ['#2ad4cb', '#e6c822']

    fig = make_subplots(rows=1, cols=2,
                        specs=[[{"type": "xy"}, {"type": "domain"}]],
                        subplot_titles=((feat.title() + ' Count'), (feat.title() + ' Percentage')))

    # Give the bars explicit x values: without them the bars sit at positions
    # 0, 1 in value_counts() order, which can show the count of value 1 at x=0.
    fig.add_trace(go.Bar(x=labels,
                         y=counts,
                         name=(feat.title() + ' Count'),
                         marker=dict(color=colors)), row=1, col=1)
    # Treat the "0"/"1" labels as categories, not as numbers on a linear axis.
    fig.update_xaxes(type='category', row=1, col=1)

    # labels must be a flat list with one entry per slice; wrapping it in
    # another list (labels=[labels]) attaches the wrong labels to the slices.
    fig.add_trace(go.Pie(labels=labels,
                         values=counts,
                         name=feat.title() + ' Percentage',
                         hole=0.5,
                         marker=dict(colors=colors)), row=1, col=2)

    fig.update_layout(height=500,
                      title_text=feat.upper() + ' STATS',
                      showlegend=True)
    fig.show()
# Draw the count/percentage figure for each binary feature in turn.
for feature in ('anemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'):
    plot_cats(feature)
In this section we'll visualize how each categorical and continuous variable correlates with our target variable, "DEATH EVENT".
OBSERVATIONS
# Number of patients for every (anemia, DEATH_EVENT) pair, one row per pair.
df_anemia = (
    df.groupby(['anemia', 'DEATH_EVENT'])[['DEATH_EVENT']]
    .count()
    .rename(columns={'DEATH_EVENT': 'count'})
    .reset_index()
)

# Number of patients per anemia value, summed over both outcomes.
anemia_count = df_anemia.groupby(['anemia'])[['count']].sum().reset_index()

# Outcome counts restricted to each anemia value.
noanemia_death = df_anemia[df_anemia['anemia'] == 0]
anemia_death = df_anemia[df_anemia['anemia'] == 1]

subplot_titles = ['ANEMIA COUNT', 'ANEMIA x DEATH EVENT COUNT', 'ANEMIA PERCENTAGES',
                  'OVERALL ANEMIA & DEATH EVENT', 'NO ANEMIA x DEATH', 'ANEMIA x DEATH']

# 3 x 2 grid: bar charts in the first row, pie charts in the other two.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{"type": kind} for _ in range(2)] for kind in ("xy", "domain", "domain")],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.13)

# Labels are listed in the sorted group order produced by groupby (0 before 1).
label1 = ['No Anemia', 'Anemia']
label2 = ['No Anemia: Survived', 'No Anemia: Died', 'Anemia: Survived', 'Anemia: Died']
label3 = ['No Anemia: Survived', 'No Anemia: Died']
label4 = ['Anemia: Survived', 'Anemia: Died']

panels = [
    (go.Bar(x=label1, y=anemia_count['count'], name='Anemia Count', marker_color='rgb(26, 118, 255)'), 1, 1),
    (go.Bar(x=label2, y=df_anemia['count'], name='Anemia vs Death Event', marker_color='rgb(235, 186, 40)'), 1, 2),
    (go.Pie(labels=label1, values=anemia_count['count']), 2, 1),
    (go.Pie(labels=label2, values=df_anemia['count']), 2, 2),
    (go.Pie(labels=label3, values=noanemia_death['count']), 3, 1),
    (go.Pie(labels=label4, values=anemia_death['count']), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=1000, showlegend=True, title_text='ANEMIA x DEATH EVENT')
fig.show()
OBSERVATIONS
# Number of patients for every (diabetes, DEATH_EVENT) pair, one row per pair.
df_diabetes = (
    df.groupby(['diabetes', 'DEATH_EVENT'])[['DEATH_EVENT']]
    .count()
    .rename(columns={'DEATH_EVENT': 'count'})
    .reset_index()
)

# Number of patients per diabetes value, summed over both outcomes.
diabetes_count = df_diabetes.groupby(['diabetes'])[['count']].sum().reset_index()

# Outcome counts restricted to each diabetes value.
nodiabetes_death = df_diabetes[df_diabetes['diabetes'] == 0]
diabetes_death = df_diabetes[df_diabetes['diabetes'] == 1]

subplot_titles = ['DIABETES COUNT', 'DIABETES x DEATH EVENT COUNT', 'DIABETES PERCENTAGES',
                  'OVERALL DIABETES & DEATH EVENT', 'NO DIABETES x DEATH', 'DIABETES x DEATH']

# 3 x 2 grid: bar charts in the first row, pie charts in the other two.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{"type": kind} for _ in range(2)] for kind in ("xy", "domain", "domain")],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.13)

# Labels are listed in the sorted group order produced by groupby (0 before 1).
label1 = ['No Diabetes', 'Diabetes']
label2 = ['No Diabetes: Survived', 'No Diabetes: Died', 'Diabetes: Survived', 'Diabetes: Died']
label3 = ['No Diabetes: Survived', 'No Diabetes: Died']
label4 = ['Diabetes: Survived', 'Diabetes: Died']

panels = [
    (go.Bar(x=label1, y=diabetes_count['count'], name='Diabetes Count', marker_color='rgb(26, 118, 255)'), 1, 1),
    (go.Bar(x=label2, y=df_diabetes['count'], name='Diabetes vs Death Event', marker_color='rgb(235, 186, 40)'), 1, 2),
    (go.Pie(labels=label1, values=diabetes_count['count']), 2, 1),
    (go.Pie(labels=label2, values=df_diabetes['count']), 2, 2),
    (go.Pie(labels=label3, values=nodiabetes_death['count']), 3, 1),
    (go.Pie(labels=label4, values=diabetes_death['count']), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=1000, showlegend=True, title_text='DIABETES x DEATH EVENT')
fig.show()
OBSERVATIONS
# Number of patients for every (high_blood_pressure, DEATH_EVENT) pair, one row per pair.
df_hbp = (
    df.groupby(['high_blood_pressure', 'DEATH_EVENT'])[['DEATH_EVENT']]
    .count()
    .rename(columns={'DEATH_EVENT': 'count'})
    .reset_index()
)

# Number of patients per high_blood_pressure value, summed over both outcomes.
hbp_count = df_hbp.groupby(['high_blood_pressure'])[['count']].sum().reset_index()

# Outcome counts restricted to each high_blood_pressure value.
nohbp_death = df_hbp[df_hbp['high_blood_pressure'] == 0]
hbp_death = df_hbp[df_hbp['high_blood_pressure'] == 1]

subplot_titles = ['HBP COUNT', 'HBP x DEATH EVENT COUNT', 'HBP PERCENTAGES',
                  'OVERALL HBP & DEATH EVENT', 'NO HBP x DEATH', 'HBP x DEATH']

# 3 x 2 grid: bar charts in the first row, pie charts in the other two.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{"type": kind} for _ in range(2)] for kind in ("xy", "domain", "domain")],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.13)

# Labels are listed in the sorted group order produced by groupby (0 before 1).
label1 = ['No HBP', 'HBP']
label2 = ['No HBP: Survived', 'No HBP: Died', 'HBP: Survived', 'HBP: Died']
label3 = ['No HBP: Survived', 'No HBP: Died']
label4 = ['HBP: Survived', 'HBP: Died']

panels = [
    (go.Bar(x=label1, y=hbp_count['count'], name='HBP Count', marker_color='rgb(26, 118, 255)'), 1, 1),
    (go.Bar(x=label2, y=df_hbp['count'], name='HBP vs Death Event', marker_color='rgb(235, 186, 40)'), 1, 2),
    (go.Pie(labels=label1, values=hbp_count['count']), 2, 1),
    (go.Pie(labels=label2, values=df_hbp['count']), 2, 2),
    (go.Pie(labels=label3, values=nohbp_death['count']), 3, 1),
    (go.Pie(labels=label4, values=hbp_death['count']), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=1000, showlegend=True, title_text='HIGH BLOOD PRESSURE (HBP) x DEATH EVENT')
fig.show()
OBSERVATIONS
# Number of patients for every (sex, DEATH_EVENT) pair, one row per pair.
df_sex = (
    df.groupby(['sex', 'DEATH_EVENT'])[['DEATH_EVENT']]
    .count()
    .rename(columns={'DEATH_EVENT': 'count'})
    .reset_index()
)

# Number of patients per sex value, summed over both outcomes.
sex_count = df_sex.groupby(['sex'])[['count']].sum().reset_index()

# Outcome counts restricted to each sex value (0 is labelled Female, 1 Male below).
female_death = df_sex[df_sex['sex'] == 0]
male_death = df_sex[df_sex['sex'] == 1]

subplot_titles = ['SEX COUNT', 'SEX x DEATH EVENT COUNT', 'SEX PERCENTAGES',
                  'OVERALL SEX & DEATH EVENT', 'FEMALE x DEATH', 'MALE x DEATH']

# 3 x 2 grid: bar charts in the first row, pie charts in the other two.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{"type": kind} for _ in range(2)] for kind in ("xy", "domain", "domain")],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.13)

# Labels are listed in the sorted group order produced by groupby (0 before 1).
label1 = ['Female', 'Male']
label2 = ['Female: Survived', 'Female: Died', 'Male: Survived', 'Male: Died']
label3 = ['Female: Survived', 'Female: Died']
label4 = ['Male: Survived', 'Male: Died']

panels = [
    (go.Bar(x=label1, y=sex_count['count'], name='Sex Count', marker_color='rgb(26, 118, 255)'), 1, 1),
    (go.Bar(x=label2, y=df_sex['count'], name='Sex vs Death Event', marker_color='rgb(235, 186, 40)'), 1, 2),
    (go.Pie(labels=label1, values=sex_count['count']), 2, 1),
    (go.Pie(labels=label2, values=df_sex['count']), 2, 2),
    (go.Pie(labels=label3, values=female_death['count']), 3, 1),
    (go.Pie(labels=label4, values=male_death['count']), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=1000, showlegend=True, title_text='SEX x DEATH EVENT')
fig.show()
OBSERVATIONS:
# Number of patients for every (smoking, DEATH_EVENT) pair, one row per pair.
df_smoking = (
    df.groupby(['smoking', 'DEATH_EVENT'])[['DEATH_EVENT']]
    .count()
    .rename(columns={'DEATH_EVENT': 'count'})
    .reset_index()
)

# Number of patients per smoking value, summed over both outcomes.
smoking_count = df_smoking.groupby(['smoking'])[['count']].sum().reset_index()

# Outcome counts restricted to each smoking value.
nonsmoking_death = df_smoking[df_smoking['smoking'] == 0]
smoking_death = df_smoking[df_smoking['smoking'] == 1]

subplot_titles = ['SMOKING COUNT', 'SMOKING x DEATH EVENT COUNT', 'SMOKING PERCENTAGES',
                  'OVERALL SMOKING & DEATH EVENT', 'NO SMOKING x DEATH', 'SMOKING x DEATH']

# 3 x 2 grid: bar charts in the first row, pie charts in the other two.
fig = make_subplots(rows=3, cols=2,
                    specs=[[{"type": kind} for _ in range(2)] for kind in ("xy", "domain", "domain")],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.13)

# Labels are listed in the sorted group order produced by groupby (0 before 1).
label1 = ['Non Smoker', 'Smoker']
label2 = ['Non Smoker: Survived', 'Non Smoker: Died', 'Smoker: Survived', 'Smoker: Died']
label3 = ['Non Smoker: Survived', 'Non Smoker: Died']
label4 = ['Smoker: Survived', 'Smoker: Died']

panels = [
    (go.Bar(x=label1, y=smoking_count['count'], name='Smoker Count', marker_color='rgb(26, 118, 255)'), 1, 1),
    (go.Bar(x=label2, y=df_smoking['count'], name='Smoker vs Death Event', marker_color='rgb(235, 186, 40)'), 1, 2),
    (go.Pie(labels=label1, values=smoking_count['count']), 2, 1),
    (go.Pie(labels=label2, values=df_smoking['count']), 2, 2),
    (go.Pie(labels=label3, values=nonsmoking_death['count']), 3, 1),
    (go.Pie(labels=label4, values=smoking_death['count']), 3, 2),
]
for trace, panel_row, panel_col in panels:
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=1000, showlegend=True, title_text='SMOKING x DEATH EVENT')
fig.show()
The charts below display the distribution of each continuous variable ('age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time'), split by whether the patient died or survived.
# One histogram per continuous column, coloured by DEATH_EVENT, with a box plot
# in the margin. Each entry is (column, figure title, extra histogram options);
# only the age histogram fixes the number of bins.
histogram_specs = [
    ("age", 'AGE x DEATH EVENT', {'nbins': 10}),
    ("creatinine_phosphokinase", 'CREATININE PHOSPHOKINASE x DEATH EVENT', {}),
    ("ejection_fraction", 'EJECTION FRACTION x DEATH EVENT', {}),
    ("platelets", 'PLATELETS x DEATH EVENT', {}),
    ("serum_creatinine", 'SERUM CREATININE x DEATH EVENT', {}),
    ("serum_sodium", 'SERUM SODIUM x DEATH EVENT', {}),
    ("time", 'TIME x DEATH EVENT', {}),
]
for column, title, extra_options in histogram_specs:
    fig = px.histogram(df,
                       x=column,
                       color="DEATH_EVENT",
                       color_discrete_sequence=['#e6c822','#2ad4cb'],
                       marginal="box",
                       hover_data=df.columns,
                       **extra_options)
    fig.update_layout(height=500, title_text=title, showlegend=True)
    fig.show()
# Number of patients for every (DEATH_EVENT, sex, high_blood_pressure) combination.
df_de = (
    df.groupby(['DEATH_EVENT', 'sex', 'high_blood_pressure'])[['age']]
    .count()
    .reset_index()
    .rename(columns={'age': 'count'})
)

# Row masks for the four outcome/sex groups (sex 0 is titled FEMALE, sex 1 MALE below).
outcome_is_0 = df_de['DEATH_EVENT'] == 0
outcome_is_1 = df_de['DEATH_EVENT'] == 1
sex_is_0 = df_de['sex'] == 0
sex_is_1 = df_de['sex'] == 1

survived_female = df_de[outcome_is_0 & sex_is_0]
survived_male = df_de[outcome_is_0 & sex_is_1]
died_female = df_de[outcome_is_1 & sex_is_0]
died_male = df_de[outcome_is_1 & sex_is_1]

subplot_titles = ['FEMALE x SURVIVED' , 'MALE x SURVIVED', 'FEMALE x DIED', 'MALE x DIED']

# 2 x 2 grid of donut charts, one per outcome/sex group.
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"type": "domain"} for _ in range(2)] for _ in range(2)],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.10)

label = ['No HBP', 'HBP']

pie_panels = [
    (survived_female, 1, 1),
    (survived_male, 1, 2),
    (died_female, 2, 1),
    (died_male, 2, 2),
]
for group, panel_row, panel_col in pie_panels:
    fig.add_trace(go.Pie(labels=label,
                         values=group['count'],
                         hole=0.5,
                         marker=dict(colors=['#2ad4cb','#e6c822'])),
                  row=panel_row, col=panel_col)

fig.update_layout(height=750, showlegend=True, title_text='DEATH & SURVIVAL PERCENTAGES BY SEX & HIGH BLOOD PRESSURE')
fig.show()
# Number of patients for every (DEATH_EVENT, sex, smoking) combination.
df_de2 = (
    df.groupby(['DEATH_EVENT', 'sex', 'smoking'])[['age']]
    .count()
    .reset_index()
    .rename(columns={'age': 'count'})
)

# Row masks for the four outcome/sex groups (sex 0 is titled FEMALE, sex 1 MALE below).
outcome_is_0 = df_de2['DEATH_EVENT'] == 0
outcome_is_1 = df_de2['DEATH_EVENT'] == 1
sex_is_0 = df_de2['sex'] == 0
sex_is_1 = df_de2['sex'] == 1

survived_female = df_de2[outcome_is_0 & sex_is_0]
survived_male = df_de2[outcome_is_0 & sex_is_1]
died_female = df_de2[outcome_is_1 & sex_is_0]
died_male = df_de2[outcome_is_1 & sex_is_1]

subplot_titles = ['FEMALE x SURVIVED' , 'MALE x SURVIVED', 'FEMALE x DIED', 'MALE x DIED']

# 2 x 2 grid of donut charts, one per outcome/sex group.
fig = make_subplots(rows=2, cols=2,
                    specs=[[{"type": "domain"} for _ in range(2)] for _ in range(2)],
                    subplot_titles=subplot_titles,
                    vertical_spacing=0.10)

label = ['Non-Smoking', 'Smoking']

# Each entry is (group, row, col, extra pie options); only the first pie is rotated.
pie_panels = [
    (survived_female, 1, 1, {'rotation': -45}),
    (survived_male, 1, 2, {}),
    (died_female, 2, 1, {}),
    (died_male, 2, 2, {}),
]
for group, panel_row, panel_col, extra_options in pie_panels:
    fig.add_trace(go.Pie(labels=label,
                         values=group['count'],
                         hole=0.5,
                         marker=dict(colors=['#2ad4cb','#e6c822']),
                         **extra_options),
                  row=panel_row, col=panel_col)

fig.update_traces(hoverinfo="label+name+value")
fig.update_layout(height=700,
                  showlegend=True,
                  margin=dict(t=100, b=0, l=0, r=0),
                  title_text='DEATH & SURVIVAL PERCENTAGES BY SEX & SMOKING STATUS')
fig.show()
# df_de_age = df.groupby(['DEATH_EVENT', 'sex', 'age'])[['anemia']].count()
# df_de_age.reset_index(inplace=True)
# df_de_age.rename(columns={'anemia':'count'}, inplace=True)
# df_de_age = df_de_age.loc[df_de_age['DEATH_EVENT'] == 1]
# df_de_age
# Outcome, sex and age of the patients with DEATH_EVENT == 1.
df_de1_age = df.loc[df['DEATH_EVENT'] == 1, ['DEATH_EVENT', 'sex', 'age']]
df_de1_age

# Age histogram of those patients, coloured by sex, with a box plot in the margin.
fig = px.histogram(
    df_de1_age,
    x="age",
    color="sex",
    marginal="box",
    color_discrete_sequence=['#e6c822','#2ad4cb'],
    hover_data=df_de1_age.columns,
)
fig.update_layout(height=500, title_text='SEX & AGE x DEATH EVENT', showlegend=True)
fig.show()
# Outcome, sex and age of the patients with DEATH_EVENT == 0.
df_de0_age = df.loc[df['DEATH_EVENT'] == 0, ['DEATH_EVENT', 'sex', 'age']]
df_de0_age

# Age histogram of those patients, coloured by sex, with a box plot in the margin.
fig = px.histogram(
    df_de0_age,
    x="age",
    color="sex",
    marginal="box",
    color_discrete_sequence=['#e6c822','#2ad4cb'],
    hover_data=df_de0_age.columns,
)
fig.update_layout(height=500, title_text='SEX & AGE of the Survivors', showlegend=True)
fig.show()