In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Main

In [2]:

class LogisticRegression(object):
    
    def __init__(self, learning_rate=0.001, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None
        self.cost_per_iteration = []
        
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    
    def fit(self, X, y):
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        
        # gradient descent

        for _ in range(self.epochs):
            # forward propagation
            Z = np.dot(X, self.weights) + self.bias
            A = self._sigmoid(Z)
            
            cost = (-1/m) * np.sum(y * np.log(A) + (1 - y) * np.log(1 - A))
            
            # backward propagation
            dw = (1 / m) * np.dot(X.T, (A - y))
            db = (1 / m) * np.sum(A - y)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            self.cost_per_iteration.append(cost)
    
    def predict(self, X):
        Z = np.dot(X, self.weights) + self.bias
        A = self._sigmoid(Z)
        y_predicted_cls = [1 if i >= 0.5 else 0 for i in A]
        return y_predicted_cls

Testing

In [3]:

cols = ["Pregnancies" ,"Glucose" ,"BloodPressure" ,"SkinThickness" ,"Insulin" ,"BMI" ,"DiabetesPedigreeFunction" ,"Age" ,"Outcome"]
url = "https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv"

diabetes_data = pd.read_csv(url, skiprows=9, header=None, names=cols)
diabetes_data.head()

Out[3]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

In [4]:

diabetes_data.shape

Out[4]:

(768, 9)

In [5]:

# X = diabetes_data[['Glucose','BMI']]
X = diabetes_data[cols[:-1]]
y = diabetes_data[cols[-1]]

In [6]:

scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

In [7]:

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, shuffle=True, random_state=42)

In [8]:

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(614, 8)
(614,)
(154, 8)
(154,)

In [9]:

# means = np.mean(X_train, axis=0)
# stds = np.std(X_train, axis=0)
# X_train = (X_train - means)/stds
# X_test = (X_test - means)/stds

In [10]:

def getAccuracy(predicted, actual):
    assert len(predicted) == len(actual)
    accuracy = np.sum(predicted == actual)
    return (accuracy/len(actual)) * 100.0

In [11]:

epochs = 6000

learning_rate = 1e-1

logisticRegression = LogisticRegression(learning_rate=learning_rate, epochs=epochs)
logisticRegression.fit(X_train, y_train)
predictions = logisticRegression.predict(X_test)

print(f"Accuracy achieved: {round(getAccuracy(predictions, y_test), 2)}")

Accuracy achieved: 76.62

In [12]:

plt.plot(range(epochs), logisticRegression.cost_per_iteration)
plt.xlabel("Iterations")
plt.ylabel("Cost")
plt.title("Cost Vs. Iteration")

Out[12]:

Text(0.5, 1.0, 'Cost Vs. Iteration')

In [13]:

# Confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)
print(cm)

[[83 16]
 [20 35]]

In [14]:

def precision_recall(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)    
    tp = cm[1,1]
    fp = cm[0,1]
    fn = cm[1,0]
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)    
    return prec, rec

precision, recall = precision_recall(y_test, predictions)
print('Precision: %f Recall %f' % (precision, recall))

Precision: 0.686275 Recall 0.636364

In [15]:

# from sklearn.metrics import precision_score, recall_score

# precision_score(y_test, predictions)
# recall_score(y_test, predictions)

In [16]:

# roc curve 
from sklearn.metrics import roc_curve

fpr, tpr, thresholdsh = roc_curve(y_test, predictions)

In [17]:

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)

In [18]:

from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, predictions)

Out[18]:

0.7373737373737373