Import¶

In [1]:

import pandas as pd
import numpy as np
import sklearn
import os
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

print sklearn.__version__
print np.__version__
print pd.__version__

0.19.1
1.14.2
0.20.3

/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Loading and Describe¶

In [2]:

DATA_DIR = '../data'
df = pd.read_table(
                    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
                    sep=','
                  )
df.head(5)

Out[2]:

	feat1	feat2	feat3	feat4	class
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

In [3]:

# examples per class
df.groupby('class')['class'].count()

Out[3]:

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [4]:

# feat1/2/3/4 are considered as input (features) to the model, whereas class is considered as output of the model
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
# encode the class with integers
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)

Train/Test Split¶

In [5]:

# ideal practice is to use test as 20% - 30% of training data
# defined by test_size in train_test_split()
# random_state is required to avoid sequential biasness in the data distribution
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)

In [6]:

X_train.shape, X_test.shape

Out[6]:

((120, 4), (30, 4))

Normalizer Class¶

In [7]:

# this class takes care for scaling the features to the scale of 0-1
# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which 
# also has the range from 0-1
class Normalizer:

    def __init__(self):
        self.sc = StandardScaler()
    
    def scale(self, X, dtype):
        if dtype=='train':
            XX = self.sc.fit_transform(X)
        elif dtype=='test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX

Logistic Model Class¶

In [8]:

class LogisticModel:
    
    def __init__(self):
        self.classifier = LogisticRegression()

    def train(self, X_train, Y_train):
        model = self.classifier.fit(X_train, Y_train)
        return model
    
    def predict(self, model, X_test):
        return model.predict(X_test)
    
    def evaluate(self, Y_test, Y_pred, measure):
        if measure=='matrix':
            cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1, 2])
            return cm
        elif measure=='accuracy':
            return accuracy_score(Y_test, Y_pred)*100
        else: return None

Starting to Train and Predict¶

In [9]:

# cap to range of 0-1
norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')

In [10]:

# train the model
logit = LogisticModel()
model = logit.train(X_train, Y_train)
predictions = logit.predict(model, X_test)

Evaluating¶

In [11]:

print logit.evaluate(Y_test, predictions, 'matrix')
print 
print logit.evaluate(Y_test, predictions, 'accuracy')

[[10  0  0]
 [ 0 10  3]
 [ 0  0  7]]

90.0

In [12]:

#                iris  versicolor  verginica
# iris       -    10      10          0
# versicolor -    0       10          3
# verginica  -    0       0           7