In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook

Data Preprocessing

In [2]:
# Load the dataset from its OpenML CSV export into a DataFrame
source_url = 'https://www.openml.org/data/get_csv/1592290/phpgNaXZe'
data = pd.read_csv(source_url)
In [3]:
# Column labels, listed in the order they are applied to the DataFrame's columns.
# NOTE(review): 'type' presumably holds the Type-A behaviour score — confirm against the dataset description.
column = 'sbp tobacco ldl adiposity famhist type obesity alcohol age chd'.split()
In [4]:
# Replace the header read from the CSV with the labels defined in `column`
data.columns = pd.Index(column)
In [5]:
# Preview the first five rows to confirm the new column labels
data.head(n=5)
Out[5]:
sbp tobacco ldl adiposity famhist type obesity alcohol age chd
0 160 12.00 5.73 23.11 1 49 25.30 97.20 52 2
1 144 0.01 4.41 28.61 2 55 28.87 2.06 63 2
2 118 0.08 3.48 32.28 1 52 29.14 3.81 46 1
3 170 7.50 6.41 38.03 1 51 31.99 24.26 58 2
4 134 13.60 3.50 27.78 1 60 25.99 57.34 49 2
In [6]:
# Summary statistics per column (count, mean, std, min, quartiles, max)
data.describe(percentiles=[0.25, 0.5, 0.75])
Out[6]:
sbp tobacco ldl adiposity famhist type obesity alcohol age chd
count 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000
mean 138.326840 3.635649 4.740325 25.406732 1.584416 53.103896 26.044113 17.044394 42.816017 1.346320
std 20.496317 4.593024 2.070909 7.780699 0.493357 9.817534 4.213680 24.481059 14.608956 0.476313
min 101.000000 0.000000 0.980000 6.740000 1.000000 13.000000 14.700000 0.000000 15.000000 1.000000
25% 124.000000 0.052500 3.282500 19.775000 1.000000 47.000000 22.985000 0.510000 31.000000 1.000000
50% 134.000000 2.000000 4.340000 26.115000 2.000000 53.000000 25.805000 7.510000 45.000000 1.000000
75% 148.000000 5.500000 5.790000 31.227500 2.000000 60.000000 28.497500 23.892500 55.000000 2.000000
max 218.000000 31.200000 15.330000 42.490000 2.000000 78.000000 46.580000 147.190000 64.000000 2.000000

`describe()` summarises each column with its count, mean, standard deviation, minimum, quartiles and maximum.

In [7]:
# Count the missing (NaN/None) entries in each column
data.isna().sum()
Out[7]:
sbp          0
tobacco      0
ldl          0
adiposity    0
famhist      0
type         0
obesity      0
alcohol      0
age          0
chd          0
dtype: int64

Every column reports 0 null entries, so the DataFrame contains no missing values.

In [8]:
# Label-encode the two categorical columns (this is encoding, not feature scaling).
# LabelEncoder maps the sorted distinct values to 0..n-1, so the raw codes 1/2 become 0/1.
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column: refitting a single shared encoder on `chd` would
# overwrite the fitted `famhist` mapping and make inverse_transform on that
# column impossible.
famhist_encoder = LabelEncoder()
data['famhist'] = famhist_encoder.fit_transform(data['famhist'])

# `encoder` stays bound to the encoder fitted on `chd`, so any code that
# reads `encoder.classes_` after this cell sees the same values.
encoder = LabelEncoder()
data['chd'] = encoder.fit_transform(data['chd'])
In [9]:
# Preview the first five rows after label encoding famhist and chd
data.head(n=5)
Out[9]:
sbp tobacco ldl adiposity famhist type obesity alcohol age chd
0 160 12.00 5.73 23.11 0 49 25.30 97.20 52 1
1 144 0.01 4.41 28.61 1 55 28.87 2.06 63 1
2 118 0.08 3.48 32.28 0 52 29.14 3.81 46 0
3 170 7.50 6.41 38.03 0 51 31.99 24.26 58 1
4 134 13.60 3.50 27.78 0 60 25.99 57.34 49 1
In [10]:
# Min-max scaler configured to map a feature's minimum to 0 and its maximum to 100
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler(feature_range=(0, 100))
In [11]:
# Normalise the sbp column with the min-max scaler (output range 0-100)
sbp_2d = data['sbp'].values.reshape(-1, 1)  # the scaler expects a 2-D (n_samples, 1) array
data['sbp'] = scale.fit_transform(sbp_2d)
In [12]:
# Preview the first five rows after rescaling sbp
data.head(n=5)
Out[12]:
sbp tobacco ldl adiposity famhist type obesity alcohol age chd
0 50.427350 12.00 5.73 23.11 0 49 25.30 97.20 52 1
1 36.752137 0.01 4.41 28.61 1 55 28.87 2.06 63 1
2 14.529915 0.08 3.48 32.28 0 52 29.14 3.81 46 0
3 58.974359 7.50 6.41 38.03 0 51 31.99 24.26 58 1
4 28.205128 13.60 3.50 27.78 0 60 25.99 57.34 49 1
In [13]:
# Summary statistics after encoding famhist/chd and rescaling sbp
data.describe(percentiles=[0.25, 0.5, 0.75])
Out[13]:
sbp tobacco ldl adiposity famhist type obesity alcohol age chd
count 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000 462.000000
mean 31.903282 3.635649 4.740325 25.406732 0.584416 53.103896 26.044113 17.044394 42.816017 0.346320
std 17.518220 4.593024 2.070909 7.780699 0.493357 9.817534 4.213680 24.481059 14.608956 0.476313
min 0.000000 0.000000 0.980000 6.740000 0.000000 13.000000 14.700000 0.000000 15.000000 0.000000
25% 19.658120 0.052500 3.282500 19.775000 0.000000 47.000000 22.985000 0.510000 31.000000 0.000000
50% 28.205128 2.000000 4.340000 26.115000 1.000000 53.000000 25.805000 7.510000 45.000000 0.000000
75% 40.170940 5.500000 5.790000 31.227500 1.000000 60.000000 28.497500 23.892500 55.000000 1.000000
max 100.000000 31.200000 15.330000 42.490000 1.000000 78.000000 46.580000 147.190000 64.000000 1.000000

Variation of each column's values across the first 50 rows of the DataFrame

Data Visualization

In [86]:
# Stacked area chart of every column over the first 50 rows
first_rows = data.head(50)
first_rows.plot.area(figsize=(10, 5));

Distribution of obesity across age

In [85]:
# Scatter plot of obesity (y) against age (x)
data.plot.scatter(x='age', y='obesity', c='blue', figsize=(10, 5));

Distribution of Tobacco consumption across age

In [82]:
# Scatter plot of tobacco (y) against age (x)
data.plot.scatter(x='age', y='tobacco', c='blue', figsize=(10, 5));

Distribution of Alcohol consumption across age

In [83]:
# Scatter plot of alcohol (y) against age (x)
data.plot.scatter(x='age', y='alcohol', c='blue', figsize=(10, 5));