import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")  # silence sklearn convergence warnings
class PCA:
    def __init__(self, n_component, solver="svd"):
        self.n_component = n_component
        self.solver = solver
        self.components = None
        self.mean = None

    def fit(self, X):
        # Center the data: principal directions are defined on mean-centered data.
        self.mean = X.mean(axis=0)
        X = X - self.mean
        if self.solver == "eig":
            # Covariance matrix; equivalent to np.cov(X.T), which expects
            # row=feature, column=sample. X is already centered above.
            cov = X.T.dot(X) / (X.shape[0] - 1)
            # eigenvalue[i] corresponds to eigenvector[:, i]
            eigenvalues, eigenvectors = np.linalg.eig(cov)
            eigenvectors = eigenvectors.T
            # Sort directions by descending eigenvalue (explained variance).
            idxs = np.argsort(eigenvalues)[::-1]
            eigenvalues = eigenvalues[idxs]
            eigenvectors = eigenvectors[idxs]
            self.components = eigenvectors[:self.n_component]
        else:
            # SVD of the centered data: the rows of Vt are the principal
            # directions, and numpy already returns the singular values
            # in descending order, so no extra sort is needed.
            _, S, Vt = np.linalg.svd(X, full_matrices=False)
            self.components = Vt[:self.n_component]

    def transform(self, X):
        # Project onto the retained principal directions.
        X = X - self.mean
        return np.dot(X, self.components.T)

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)
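As a quick sanity check (not part of the original notebook), the two solvers should recover the same principal directions up to sign: eigendecomposition of the covariance matrix and SVD of the centered data span the same subspace. A minimal sketch comparing them on synthetic data, also cross-checking against sklearn's `PCA`:

```python
from sklearn.decomposition import PCA as SkPCA

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 5))

pca_eig = PCA(n_component=3, solver="eig")
pca_svd = PCA(n_component=3, solver="svd")
pca_eig.fit(X_demo)
pca_svd.fit(X_demo)

# Matching components may differ in sign, so compare absolute dot
# products: each pair should be parallel (|cos| close to 1.0).
for u, v in zip(pca_eig.components, pca_svd.components):
    print(abs(np.dot(u, v)))

sk = SkPCA(n_components=3).fit(X_demo)
for u, v in zip(pca_svd.components, sk.components_):
    print(abs(np.dot(u, v)))  # also ~1.0 against sklearn
```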
diabetes_data = pd.read_csv(r'../datasets/diabetes_data.csv')
diabetes_data.head()
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
X = diabetes_data[diabetes_data.columns[:-1]].values
y = diabetes_data[diabetes_data.columns[-1]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# using Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Before feature extraction")
print(f"Number of features of X: {X_train.shape[1]}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
Before feature extraction
Number of features of X: 8
Accuracy: 0.7142857142857143
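The next cell keeps six principal components. Why six? A quick, hedged way to sanity-check the choice of `n_component` (not part of the original notebook) is the cumulative explained-variance ratio: the sorted eigenvalues of the covariance matrix, normalized by their sum, tell you how much of the total variance each additional component retains.

```python
# Sketch: cumulative explained variance for the diabetes features.
# Assumes X is the diabetes feature matrix defined above.
eigvals = np.sort(np.linalg.eigvalsh(np.cov(X.T)))[::-1]
cum_ratio = np.cumsum(eigvals) / eigvals.sum()
for k, r in enumerate(cum_ratio, start=1):
    print(f"{k} components -> {r:.4f} of total variance")
```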
# Transforming Dataset
pca = PCA(n_component=6)
X_transformed = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, stratify=y, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("After feature extraction")
print(f"Number of features of X: {X_train.shape[1]}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
After feature extraction
Number of features of X: 6
Accuracy: 0.7337662337662337
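One caveat worth flagging: above, PCA is fit on all of `X` before the train/test split, so the test rows influence the learned mean and components. A leakage-free variant (an adjustment, not the notebook's original flow) splits first, fits PCA on the training split only, and reuses that fit on the test split; the reported accuracy may differ slightly.

```python
# Split first, then fit PCA on the training data only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

pca = PCA(n_component=6)
X_train_pca = pca.fit_transform(X_train)  # learn mean/components from train
X_test_pca = pca.transform(X_test)        # reuse them on test

lr = LogisticRegression()
lr.fit(X_train_pca, y_train)
print(f"Accuracy (train-only PCA fit): {accuracy_score(y_test, lr.predict(X_test_pca))}")
```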
Reducing the number of features for visualization
data = load_iris()
X = data.data
y = data.target
pca = PCA(n_component=2)
X_projected = pca.fit_transform(X)
print(f"Shape of X: {X.shape}")
print(f"Shape of transformed X: {X_projected.shape}")
Shape of X: (150, 4)
Shape of transformed X: (150, 2)
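Before plotting, a quick check (not in the original notebook) that a 2-D view is faithful here: the fraction of variance the two retained components keep can be read off the covariance eigenvalues, and for iris the first two directions capture the large majority of the spread.

```python
# Sketch: variance retained by the two iris components.
eigvals = np.sort(np.linalg.eigvalsh(np.cov(X.T)))[::-1]
print(f"Variance retained by 2 components: {eigvals[:2].sum() / eigvals.sum():.4f}")
```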
x1 = X_projected[:, 0]
x2 = X_projected[:, 1]
X_trans = np.c_[x1, x2, y]
X_trans.shape
(150, 3)
colors = ["red", "blue", "green"]
for i in range(3):
plt.scatter(X_trans[X_trans[:, 2] == i][:, 0], X_trans[X_trans[:, 2] == i][:, 1], c=colors[i],
edgecolors="k", alpha=0.7, label=data.target_names[i])
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()