# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
# Load the dataset from its OpenML CSV export
data = pd.read_csv(
    'https://www.openml.org/data/get_csv/1592290/phpgNaXZe'
)
# Overwrite the column labels with the names used in the rest of the analysis
column = [
    'sbp', 'tobacco', 'ldl', 'adiposity', 'famhist',
    'type', 'obesity', 'alcohol', 'age', 'chd',
]
data.columns = column
# Preview the first rows, then the per-column summary statistics
data.head()
data.describe()
# describe() summarises each numeric column: count, mean, standard deviation, min, quartiles and max.
# Count the missing entries in each column
data.isna().sum()
# Every column reports 0, so the DataFrame contains no missing values.
# Label-encode the categorical columns as integer codes.
# NOTE: this step is label encoding, not feature scaling.
from sklearn.preprocessing import LabelEncoder
# Use a dedicated encoder for 'famhist' so its fitted classes_ mapping stays
# available (e.g. for inverse_transform); refitting one shared encoder on
# 'chd' would discard the 'famhist' mapping.
famhist_encoder = LabelEncoder()
data['famhist'] = famhist_encoder.fit_transform(data['famhist'])
# `encoder` ends up fitted on 'chd' and holds the 'chd' mapping.
encoder = LabelEncoder()
data['chd'] = encoder.fit_transform(data['chd'])
data.head()
from sklearn.preprocessing import MinMaxScaler
# Min-max normalise 'sbp' onto the 0-100 range
scale = MinMaxScaler(feature_range=(0, 100))
# Present the column as a single-feature 2-D array before fitting the scaler
sbp_matrix = data['sbp'].values.reshape(-1, 1)
data['sbp'] = scale.fit_transform(sbp_matrix)
data.head()
# Summary statistics after the transformation
data.describe()
# Area plot showing how the column values vary across the first 50 rows
data.head(50).plot.area(figsize=(10, 5));
# Scatter plot of obesity against age
data.plot.scatter(x='age', y='obesity', figsize=(10, 5), c='blue');
# Scatter plot of tobacco consumption against age
data.plot.scatter(x='age', y='tobacco', figsize=(10, 5), c='blue');
# Scatter plot of alcohol consumption against age
data.plot.scatter(x='age', y='alcohol', figsize=(10, 5), c='blue');