import numpy as np
import pandas as pd
import sklearn.metrics
import os
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
# reading data into pandas dataframe
DATA_DIR = 'data'
df = pd.read_table(
os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
sep=','
)
df.head(5)
| | feat1 | feat2 | feat3 | feat4 | class |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# encoding the class to integers
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
# encode the class with integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
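# quick sanity check (optional): the fitted encoder stores the class names in sorted
# order, and their positions are the integer codes it assigns
print(le.classes_)                   # e.g. ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
print(le.inverse_transform(Y[:5]))   # maps the integer codes back to the original names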
# common practice is to hold out 20%-30% of the data as a test set,
# controlled by test_size in train_test_split()
# the split is shuffled so the ordering of the rows does not bias either set;
# random_state fixes the shuffle, making the split reproducible
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=10)
    return X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = data_split(X, Y)
print(X_train.shape, X_test.shape)
(105, 4) (45, 4)
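# optional variant: passing stratify=Y makes train_test_split keep the class proportions
# identical in both splits, which is often preferred for small datasets like Iris;
# shown as a sketch here and not used for the results below
Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(X, Y, test_size=0.30, random_state=10, stratify=Y)
print(np.bincount(Ys_train), np.bincount(Ys_test))  # class counts stay balanced, e.g. [35 35 35] [15 15 15]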
# this class takes care of scaling the features
# StandardScaler standardizes each feature to zero mean and unit variance (it does not
# clip values to 0-1); putting features on a comparable scale helps logistic regression converge
class Normalizer:
    def __init__(self):
        self.sc = StandardScaler()

    def scale(self, X, dtype):
        # fit the scaler on training data only; reuse the fitted statistics for test data
        if dtype == 'train':
            XX = self.sc.fit_transform(X)
        elif dtype == 'test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX
norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')
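# quick check (optional): after standardization each training feature should have
# roughly zero mean and unit standard deviation
print(X_train.mean(axis=0).round(2), X_train.std(axis=0).round(2))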
from sklearn.linear_model import LogisticRegression
# train the model
classifier = LogisticRegression()
model = classifier.fit(X_train, Y_train)
predictions_lr = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_lr, axis=1)))
0.8666666666666667
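# beyond the single accuracy number, a confusion matrix shows which classes get mixed up
# (a small sketch reusing the predictions from above)
print(sklearn.metrics.confusion_matrix(Y_test, np.argmax(predictions_lr, axis=1)))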
from sklearn import tree
# train the model
classifier = tree.DecisionTreeClassifier()
model = classifier.fit(X_train, Y_train)
predictions_dtree = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_dtree, axis=1)))
0.9777777777777777
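# optional: the fitted tree reports how much each feature contributed to its splits
print(model.feature_importances_)  # one weight per feature, summing to 1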
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
model = knn.fit(X_train, Y_train)
predictions_knn = model.predict_proba(X_test)
print(sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_knn, axis=1)))
0.9555555555555556
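# n_neighbors=3 was picked by hand; a quick way to sanity-check it is cross-validation
# on the training set (a sketch; scores will vary with the split)
from sklearn.model_selection import cross_val_score
for k in (1, 3, 5, 7):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, Y_train, cv=5)
    print(k, scores.mean())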
class Ensemble(object):
    """
    Implements an averaging (soft) voting ensemble
    Each model is given equal weight
    """
    def __init__(self, samples=None, classes=None, classifiers=None):
        self.classes = classes
        self.samples = samples
        self.classifiers = classifiers

    def mixmatch(self, predictions):
        # infer the dimensions from the predictions if they were not supplied
        if not self.classifiers:
            self.classifiers = len(predictions)
        if not self.samples:
            self.samples = len(predictions[0])
        if not self.classes:
            self.classes = len(predictions[0][0])
        final_pred = np.array([0] * self.classes)
        for s in range(self.samples):
            # collect each classifier's probability vector for sample s
            s_pred = np.array([0] * self.classes)
            for c in range(self.classifiers):
                pred = predictions[c][s]
                s_pred = np.vstack((s_pred, pred))
            s_pred = s_pred[1:, :]
            # average the probabilities across classifiers (equal weights)
            s_pred_avg = np.average(s_pred, axis=0)
            final_pred = np.vstack((final_pred, s_pred_avg))
        return final_pred[1:, :]
ensemble = Ensemble(45, 3, 3)
pred = np.argmax(ensemble.mixmatch([predictions_lr, predictions_dtree, predictions_knn]), axis=1)
print(sklearn.metrics.accuracy_score(Y_test, pred))
1.0
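# the same averaging idea is built into scikit-learn as soft voting; a minimal sketch of
# the equivalent setup (results may differ slightly from the hand-rolled ensemble above
# depending on estimator defaults)
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('dtree', tree.DecisionTreeClassifier()),
        ('knn', KNeighborsClassifier(n_neighbors=3)),
    ],
    voting='soft'  # average predicted class probabilities, as mixmatch() does
)
voting.fit(X_train, Y_train)
print(sklearn.metrics.accuracy_score(Y_test, voting.predict(X_test)))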