import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
data=pd.read_csv('Diabetes.csv')
data.head()
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
data.describe().T
|   | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Pregnancies | 768.0 | 3.845052 | 3.369578 | 0.000 | 1.00000 | 3.0000 | 6.00000 | 17.00 |
| Glucose | 768.0 | 120.894531 | 31.972618 | 0.000 | 99.00000 | 117.0000 | 140.25000 | 199.00 |
| BloodPressure | 768.0 | 69.105469 | 19.355807 | 0.000 | 62.00000 | 72.0000 | 80.00000 | 122.00 |
| SkinThickness | 768.0 | 20.536458 | 15.952218 | 0.000 | 0.00000 | 23.0000 | 32.00000 | 99.00 |
| Insulin | 768.0 | 79.799479 | 115.244002 | 0.000 | 0.00000 | 30.5000 | 127.25000 | 846.00 |
| BMI | 768.0 | 31.992578 | 7.884160 | 0.000 | 27.30000 | 32.0000 | 36.60000 | 67.10 |
| DiabetesPedigreeFunction | 768.0 | 0.471876 | 0.331329 | 0.078 | 0.24375 | 0.3725 | 0.62625 | 2.42 |
| Age | 768.0 | 33.240885 | 11.760232 | 21.000 | 24.00000 | 29.0000 | 41.00000 | 81.00 |
| Outcome | 768.0 | 0.348958 | 0.476951 | 0.000 | 0.00000 | 0.0000 | 1.00000 | 1.00 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
sns.pairplot(data=data,hue='Outcome',diag_kind='kde')
plt.show()
data.hist(figsize = (20,20))
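Several of these columns use 0 where a measurement is missing: a glucose, blood-pressure, skin-thickness, insulin or BMI reading of 0 is not physiologically plausible. As a minimal sanity check (not part of the original pipeline), the zeros can be counted per column before they are converted to NaN in the next step:

zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Number of rows with a zero in each of these measurement columns
(data[zero_cols] == 0).sum()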
# Zero is not a valid reading for these measurements, so treat zeros as missing values
data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0, np.nan)
data.isnull().sum()
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
# sns.heatmap(data.isnull())
def median_target(var):
    # Fill missing values in `var` with the median of that column,
    # computed separately for each Outcome class.
    temp = data[data[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median()
    data.loc[(data['Outcome'] == 0) & (data[var].isnull()), var] = float(np.array(temp)[0])
    data.loc[(data['Outcome'] == 1) & (data[var].isnull()), var] = float(np.array(temp)[1])
median_target('Insulin')
median_target('Glucose')
median_target('BloodPressure')
median_target('SkinThickness')
median_target('BMI')
data.isnull().sum()
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
sns.heatmap(data.isnull())
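For reference, the class-conditional medians that median_target used for imputation can be inspected with a simple groupby; this is an optional sketch, not part of the original notebook:

# Median of each imputed column, computed separately for Outcome 0 and 1
data.groupby('Outcome')[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].median()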
plt.figure(figsize=(10,10))
sns.heatmap(data.corr(), cmap="YlGnBu", annot=True)
plt.show()
class evaluate_all_model:
    # Imports placed in the class body become class attributes,
    # so the classifiers and helpers below are accessed through `self`.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.preprocessing import StandardScaler
    import time

    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.train_test_split()
        self.define_models()
        self.evaluate_model()
        print("Best model based on accuracy:")
        print(self.best_model)

    def train_test_split(self):
        # Split, then scale features with statistics learned on the training set only.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=0.33, random_state=3)
        sc = self.StandardScaler()
        self.X_train = sc.fit_transform(self.X_train)
        self.X_test = sc.transform(self.X_test)

    def define_models(self):
        self.models = {'LogisticRegression': self.LogisticRegression(),
                       'RandomForestClassifier': self.RandomForestClassifier(),
                       'KNeighborsClassifier': self.KNeighborsClassifier(),
                       'DecisionTreeClassifier': self.DecisionTreeClassifier(),
                       'SupportVectorMachine': self.SVC(),
                       'GaussianNB': self.GaussianNB(),
                       'BernoulliNB': self.BernoulliNB(),
                       'GradientBoostingClassifier': self.GradientBoostingClassifier()}
        self.modelNames = list(self.models.keys())
        self.trainScores = []
        self.testScores = []
        self.Time_taken = []
        self.best_model_score = 0
        self.best_model = {}
        # Track the fastest model; start from infinity rather than an arbitrary constant.
        self.less_time = float('inf')
    def evaluate_model(self):
        for i in self.models:
            start = self.time.time()
            model = self.models[i]
            model.fit(self.X_train, self.y_train)
            train_score = model.score(self.X_train, self.y_train)
            self.trainScores.append(train_score)
            print(f'Model:- {i}')
            print(f'training score:- {train_score}')
            test_score = model.score(self.X_test, self.y_test)
            self.testScores.append(test_score)
            print(f'test Score:- {test_score}')
            y_predictions = model.predict(self.X_test)
            # sklearn's convention is confusion_matrix(y_true, y_pred)
            conf_matrix = confusion_matrix(self.y_test, y_predictions)
            print(f'Confusion Matrix: \n{conf_matrix}\n')
            tn, fp, fn, tp = conf_matrix.ravel()
            accuracy = (tp + tn) / (tp + fp + tn + fn)
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1score = 2 * precision * recall / (precision + recall)
            specificity = tn / (tn + fp)
            print(f'Accuracy : {accuracy}')
            print(f'Precision: {precision}')
            print(f'Recall : {recall}')
            print(f'F1 score : {f1score}')
            print(f'Specificity : {specificity}')
            end = self.time.time()
            time_taken = end - start
            self.Time_taken.append(time_taken)
            print(f'Time required {time_taken}')
            print("***************************************************************************")
            print("____________________________________________________________________________")
            print("\n\n\n")
            # Keep the model with the highest test accuracy and remember the fastest run.
            if float(test_score) > self.best_model_score:
                self.best_model["model Name"] = i
                self.best_model["Time Required on train and test"] = time_taken
                self.best_model["Accuracy on train data"] = train_score
                self.best_model["Accuracy on test data"] = accuracy
                self.best_model_score = test_score
            if time_taken < self.less_time:
                self.less_time = time_taken
    def plot_bar(self):
        # Paired bars: training score next to test score for every model.
        plt.bar(np.arange(len(self.trainScores)), self.trainScores, color='blue', width=0.25, edgecolor='white', label='Train')
        plt.bar([x + 0.25 for x in np.arange(len(self.trainScores))], self.testScores, color='red', width=0.25, edgecolor='white', label='Test')
        plt.xlabel('Models', fontweight='bold', size=24)
        plt.ylabel('Scores', fontweight='bold', size=24)
        # Centre the tick labels between the paired bars.
        plt.xticks([r + 0.125 for r in range(len(self.trainScores))], self.modelNames, rotation=60)
        plt.legend()
        plt.show()

    def get_data(self):
        self.temp_dict = {}
        self.temp_dict["Model"] = self.modelNames
        self.temp_dict["Training Score"] = self.trainScores
        self.temp_dict["Accuracy on Test"] = self.testScores
        self.temp_dict["Time Taken"] = self.Time_taken
        return self.temp_dict

    def get_dataframe(self):
        # Use self rather than the global instance so the method works for any object.
        return pd.DataFrame.from_dict(self.get_data())
X = data.drop('Outcome',axis=1).values
y = data['Outcome'].values
at=evaluate_all_model(X,y)
Model:- LogisticRegression
training score:- 0.7898832684824902    test Score:- 0.7677165354330708
Confusion Matrix: [[131 42] [17 64]]
Accuracy: 0.7677165354330708   Precision: 0.6037735849056604   Recall: 0.7901234567901234
F1 score: 0.6844919786096257   Specificity: 0.7572254335260116
Time required 0.017589569091796875
____________________________________________________________________________

Model:- RandomForestClassifier
training score:- 1.0    test Score:- 0.8740157480314961
Confusion Matrix: [[132 16] [16 90]]
Accuracy: 0.8740157480314961   Precision: 0.8490566037735849   Recall: 0.8490566037735849
F1 score: 0.8490566037735849   Specificity: 0.8918918918918919
Time required 0.7530696392059326
____________________________________________________________________________

Model:- KNeighborsClassifier
training score:- 0.8599221789883269    test Score:- 0.8070866141732284
Confusion Matrix: [[127 28] [21 78]]
Accuracy: 0.8070866141732284   Precision: 0.7358490566037735   Recall: 0.7878787878787878
F1 score: 0.7609756097560976   Specificity: 0.8193548387096774
Time required 0.12662816047668457
____________________________________________________________________________

Model:- DecisionTreeClassifier
training score:- 1.0    test Score:- 0.8267716535433071
Confusion Matrix: [[131 27] [17 79]]
Accuracy: 0.8267716535433071   Precision: 0.7452830188679245   Recall: 0.8229166666666666
F1 score: 0.7821782178217821   Specificity: 0.8291139240506329
Time required 0.012286663055419922
____________________________________________________________________________

Model:- SupportVectorMachine
training score:- 0.896887159533074    test Score:- 0.8228346456692913
Confusion Matrix: [[129 26] [19 80]]
Accuracy: 0.8228346456692913   Precision: 0.7547169811320755   Recall: 0.8080808080808081
F1 score: 0.7804878048780488   Specificity: 0.832258064516129
Time required 0.0999448299407959
____________________________________________________________________________

Model:- GaussianNB
training score:- 0.7762645914396887    test Score:- 0.7480314960629921
Confusion Matrix: [[123 39] [25 67]]
Accuracy: 0.7480314960629921   Precision: 0.6320754716981132   Recall: 0.7282608695652174
F1 score: 0.6767676767676767   Specificity: 0.7592592592592593
Time required 0.007964611053466797
____________________________________________________________________________

Model:- BernoulliNB
training score:- 0.8463035019455253    test Score:- 0.8622047244094488
Confusion Matrix: [[129 16] [19 90]]
Accuracy: 0.8622047244094488   Precision: 0.8490566037735849   Recall: 0.8256880733944955
F1 score: 0.8372093023255816   Specificity: 0.8896551724137931
Time required 0.008002996444702148
____________________________________________________________________________

Model:- GradientBoostingClassifier
training score:- 0.9961089494163424    test Score:- 0.889763779527559
Confusion Matrix: [[137 17] [11 89]]
Accuracy: 0.889763779527559   Precision: 0.839622641509434   Recall: 0.89
F1 score: 0.8640776699029127   Specificity: 0.8896103896103896
Time required 0.334606409072876
____________________________________________________________________________

Best model based on accuracy:
{'model Name': 'GradientBoostingClassifier', 'Time Required on train and test': 0.334606409072876, 'Accuracy on train data': 0.9961089494163424, 'Accuracy on test data': 0.889763779527559}
at.get_dataframe()
|   | Model | Training Score | Accuracy on Test | Time Taken |
|---|---|---|---|---|
| 0 | LogisticRegression | 0.789883 | 0.767717 | 0.017590 |
| 1 | RandomForestClassifier | 1.000000 | 0.874016 | 0.753070 |
| 2 | KNeighborsClassifier | 0.859922 | 0.807087 | 0.126628 |
| 3 | DecisionTreeClassifier | 1.000000 | 0.826772 | 0.012287 |
| 4 | SupportVectorMachine | 0.896887 | 0.822835 | 0.099945 |
| 5 | GaussianNB | 0.776265 | 0.748031 | 0.007965 |
| 6 | BernoulliNB | 0.846304 | 0.862205 | 0.008003 |
| 7 | GradientBoostingClassifier | 0.996109 | 0.889764 | 0.334606 |
at.plot_bar()
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=3,stratify=y)
sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
model = GradientBoostingClassifier(n_estimators=530,max_features=8,random_state=2)
model.fit(X_train,y_train)
model.score(X_test,y_test)
0.9134199134199135
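The single accuracy figure can be complemented with a confusion matrix on the same held-out test set, reusing the metrics already imported at the top of the notebook; a minimal sketch:

y_pred = model.predict(X_test)
# Rows are the true classes, columns the predicted classes
print(confusion_matrix(y_test, y_pred))
# Should agree with model.score(X_test, y_test) above
print(accuracy_score(y_test, y_pred))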
import pickle
# Persist the trained model and the fitted scaler for later reuse
with open('Diabetes_model', 'wb') as f:
    pickle.dump(model, f)
with open('Diabetes_standard_scaler', 'wb') as f:
    pickle.dump(sc, f)
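To reuse the saved artifacts later, the scaler and model can be loaded back and applied to a new sample given in the same column order as the training data (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age). A minimal sketch; the sample values are made up for illustration:

with open('Diabetes_standard_scaler', 'rb') as f:
    loaded_sc = pickle.load(f)
with open('Diabetes_model', 'rb') as f:
    loaded_model = pickle.load(f)
# Hypothetical patient record, same feature order as the training data
sample = np.array([[2, 120, 70, 25, 100, 30.5, 0.45, 35]])
loaded_model.predict(loaded_sc.transform(sample))  # 1 = diabetic, 0 = non-diabetic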