To find out which factor -> age,trestbps,thalach¶

influences more on patient having heart disease or not using DecisionTree¶

In [4]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv("heart1.csv")

In [5]:

dataset.head()

Out[5]:

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

In [6]:

dataset.target.value_counts()

Out[6]:

1    165
0    138
Name: target, dtype: int64

In [7]:

age = dataset[['age']]
trestbps = dataset[['trestbps']]
thalach = dataset[['thalach']]
target =dataset[['target']]

In [17]:

dataset.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 13 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(12)
memory usage: 30.9 KB

In [8]:

from sklearn.model_selection import train_test_split
x_train,x_test, y_train,y_test = train_test_split(age,target,test_size=0.30)

In [9]:

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(x_train,y_train)

Out[9]:

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:

y_pred = classifier.predict(x_test)
z= pd.DataFrame(y_pred)
pp=x_test.merge(z,how='inner',left_index=True, right_index=True)
pp.head()

Out[10]:

	age	0
50	51	1
31	65	0
88	54	1
72	29	0
27	51	1

In [11]:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[17 16]
 [27 31]]

In [12]:

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.5274725274725275

In [13]:

x_data=[]
for x_data in trestbps,thalach:
    x_train,x_test, y_train,y_test = train_test_split(x_data,target,test_size=0.30)
    classifier = DecisionTreeClassifier()
    classifier.fit(x_train,y_train)
    y_pred = classifier.predict(x_test)
    print(confusion_matrix(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))
    
    
 

[[14 31]
 [ 9 37]]
0.5604395604395604
[[23 12]
 [26 30]]
0.5824175824175825

To find out which factor -> cp,chol,ca¶

influences more on patient having heart disease using Naive Bayes¶

In [1]:

import pandas as pd
import numpy as np

data =pd.read_csv('heart1.csv')
data.head()

Out[1]:

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

In [17]:

data.loc[:,['cp','chol','ca']].dtypes

Out[17]:

cp      int64
chol    int64
ca      int64
dtype: object

In [19]:

print(data.cp.unique())
print(data.chol.unique())
print(data.ca.unique())

[3 2 1 0]
[233 250 204 236 354 192 294 263 199 168 239 275 266 211 283 219 340 226
 247 234 243 302 212 175 417 197 198 177 273 213 304 232 269 360 308 245
 208 264 321 325 235 257 216 256 231 141 252 201 222 260 182 303 265 309
 186 203 183 220 209 258 227 261 221 205 240 318 298 564 277 214 248 255
 207 223 288 160 394 315 246 244 270 195 196 254 126 313 262 215 193 271
 268 267 210 295 306 178 242 180 228 149 278 253 342 157 286 229 284 224
 206 167 230 335 276 353 225 330 290 172 305 188 282 185 326 274 164 307
 249 341 407 217 174 281 289 322 299 300 293 184 409 259 200 327 237 218
 319 166 311 169 187 176 241 131]
[0 2 1 3 4]

In [27]:

cp =data[['cp']]
chol =data[['chol']]
ca=data[['ca']]
Y=data[['target']]

In [35]:

from sklearn.naive_bayes import BernoulliNB 
#using Bernoulli style of NB 
from sklearn.model_selection import train_test_split

x_data=[]
for x_data in cp,chol,ca:
    X_train, X_test, Y_train, Y_test = train_test_split(cp, Y, test_size=0.25)

    bnb = BernoulliNB(binarize=0.0)
    bnb.fit(X_train, Y_train)
    print(bnb.score(X_test, Y_test))

0.7894736842105263
0.8026315789473685
0.7368421052631579

D:\softwares\Anaconda\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
D:\softwares\Anaconda\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
D:\softwares\Anaconda\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [46]:

#predicting target based on model build on y:target, x:ca
datas = np.array([4]).reshape(-1,1)
print(bnb.predict(datas))

[1]

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	target
0	63	1	3	145	233	1	0	150	0	2.3	0	1
1	37	1	2	130	250	0	1	187	0	3.5	0	1
2	41	0	1	130	204	0	0	172	0	1.4	2	1
3	56	1	1	120	236	0	1	178	0	0.8	2	1
4	57	0	0	120	354	0	1	163	1	0.6	2	1