Gaussian Naive Bayes
*Bayes' Theorem:*

$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$$
*Naive Bayes:*
$$P(y \mid X) = \frac{P(X \mid y)\,P(y)}{P(X)}$$

where the feature vector is $X = (x_1, x_2, \ldots, x_n)$. Under the "naive" assumption that the features are conditionally independent given the class,

$$P(y \mid X) = \frac{P(x_1 \mid y)\,P(x_2 \mid y) \cdots P(x_n \mid y)\,P(y)}{P(X)}$$
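In *Gaussian* Naive Bayes each class conditional $P(x_i \mid y)$ is modeled as a normal density, and since a product of many small densities underflows easily, the implementation below works in log space. Concretely, the code that follows computes

$$P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\!\left(-\frac{(x_i - \mu_y)^2}{2\sigma_y^2}\right)$$

$$\hat{y} = \arg\max_y \left[\log P(y) + \sum_{i=1}^{n} \log P(x_i \mid y)\right]$$

where $\mu_y$ and $\sigma_y^2$ are the per-class, per-feature mean and variance estimated from the training data, and the constant $P(X)$ is dropped because it does not affect the argmax.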
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


class NaiveBayes(object):
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        # per-class mean, variance and prior -- everything the Gaussian pdf
        # needs at prediction time
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)
        # index rows by position (idx), not by label (c), so the class labels
        # need not be the contiguous integers 0..n_classes-1
        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []
        # log posterior (up to the constant log P(X)) for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            class_conditional = np.sum(np.log(self.gaussian_pdf(idx, x)))
            posteriors.append(prior + class_conditional)
        # return the class with the highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def gaussian_pdf(self, class_idx, x):
        # feature-wise univariate Gaussian density for the given class
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator


def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)
```
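One practical caveat with `gaussian_pdf`: if a feature has zero variance within some class, the density divides by zero, and a density of exactly zero makes `np.log` return `-inf`. A minimal, hypothetical tweak (not part of the class above) is to pad the variances after fitting, in the spirit of the `var_smoothing` parameter of scikit-learn's `GaussianNB`:

```python
# Hypothetical subclass for illustration: add a small epsilon, scaled to the
# data, to every per-class variance so the pdf never divides by zero.
class SmoothedNaiveBayes(NaiveBayes):
    def fit(self, X, y, eps=1e-9):
        super().fit(X, y)
        self._var += eps * X.var(axis=0).max()
```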
```python
from sklearn.datasets import load_iris

data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
```

```
(112, 4)
(38, 4)
(112,)
(38,)
```
```python
nb = NaiveBayes()
nb.fit(X_train, y_train.ravel())
y_pred = nb.predict(X_test)
print(f"Naive Bayes accuracy: {accuracy(y_test, y_pred)}")
```

```
Naive Bayes accuracy: 0.8947368421052632
```
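Note that the split above has no `random_state`, so the exact accuracy will vary from run to run. As a quick sanity check, the result should land in the same ballpark as scikit-learn's own `GaussianNB` on the identical split:

```python
from sklearn.naive_bayes import GaussianNB

# Fit sklearn's reference implementation on the same split and compare.
sk_nb = GaussianNB()
sk_nb.fit(X_train, y_train)
print(f"sklearn GaussianNB accuracy: {sk_nb.score(X_test, y_test)}")
```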
```python
# Diabetes dataset (Pima Indians)
cols = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
        "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
url = "https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv"
diabetes_data = pd.read_csv(url, skiprows=9, header=None, names=cols)
diabetes_data.head()
```
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
```python
diabetes_data.shape
```

```
(768, 9)
```
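A caveat worth knowing about this dataset: zeros in columns such as Glucose, BloodPressure, SkinThickness, Insulin and BMI are generally placeholders for missing measurements (visible in the head above). A hypothetical cleanup step, not part of the pipeline below, could impute per-column medians before fitting:

```python
# Hypothetical preprocessing sketch: treat zeros as missing, impute medians.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
cleaned = diabetes_data.copy()
cleaned[zero_as_missing] = cleaned[zero_as_missing].replace(0, np.nan)
cleaned[zero_as_missing] = cleaned[zero_as_missing].fillna(
    cleaned[zero_as_missing].median())
```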
```python
X = diabetes_data[cols[:-1]].values
y = diabetes_data[cols[-1]].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)

nb = NaiveBayes()
nb.fit(X_train, y_train.ravel())
y_pred = nb.predict(X_test)
print(f"Naive Bayes accuracy: {accuracy(y_test, y_pred)}")
```

```
Naive Bayes accuracy: 0.7532467532467533
```
```python
from sklearn.metrics import precision_score, recall_score, f1_score

print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1-Score: {f1_score(y_test, y_pred)}")
```

```
Precision: 0.6428571428571429
Recall: 0.6666666666666666
F1-Score: 0.6545454545454545
```
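To see how the precision and recall figures arise from individual predictions, scikit-learn's `confusion_matrix` tabulates the same predictions (rows are true labels, columns are predicted labels):

```python
from sklearn.metrics import confusion_matrix

# Layout for binary labels [0, 1]: [[TN, FP], [FN, TP]]
print(confusion_matrix(y_test, y_pred))
```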