In [37]:

# Imports libraries
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import os

print pd.__version__

0.20.3

In [38]:

# reading table as dataframe
DATA_DIR = '../data'

df = pd.read_table(
                    os.path.abspath(os.path.join(DATA_DIR, 'day8/titanic.csv')),
                    sep=',', 
                    index_col='PassengerId'
                  )
df.head(5)

Out[38]:

	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
PassengerId
1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S

In [39]:

df.shape

Out[39]:

(891, 11)

In [40]:

# identifiying the missing values across all the colums
df.isnull().sum()

Out[40]:

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [41]:

# dropping name, cabin,  Fare, Embarked ticket columns
# Name does not give any information if a person will live or not
# cabin, Ticket, Fare are correlated to eachother and to PClass; so removing
df.drop(['Cabin', 'Ticket',  'Name', 'Fare', 'Embarked'], axis=1, inplace=True)

In [42]:

# fill age mean to NaN value
df[df['Age'].isnull()] = df['Age'].mean()

In [43]:

# check for any other NaN value
df.isnull().sum()

Out[43]:

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
dtype: int64

In [44]:

df.head(5)

Out[44]:

	Survived	Pclass	Sex	Age	SibSp	Parch
PassengerId
1	0.0	3.0	male	22.0	1.0	0.0
2	1.0	1.0	female	38.0	1.0	0.0
3	1.0	3.0	female	26.0	0.0	0.0
4	1.0	1.0	female	35.0	1.0	0.0
5	0.0	3.0	male	35.0	0.0	0.0

In [45]:

# converting Sex to numbers
replacements_sex = {'male': 0, 'female': 1}
df['Sex'].replace(replacements_sex, inplace=True)

In [46]:

df.head(6)

Out[46]:

	Survived	Pclass	Sex	Age	SibSp	Parch
PassengerId
1	0.000000	3.000000	0.000000	22.000000	1.000000	0.000000
2	1.000000	1.000000	1.000000	38.000000	1.000000	0.000000
3	1.000000	3.000000	1.000000	26.000000	0.000000	0.000000
4	1.000000	1.000000	1.000000	35.000000	1.000000	0.000000
5	0.000000	3.000000	0.000000	35.000000	0.000000	0.000000
6	29.699118	29.699118	29.699118	29.699118	29.699118	29.699118

In [47]:

# removing duplicates
# redundancy does not help our model to generalize better; introduces biasness
df.drop_duplicates(keep=False, inplace=True)

In [48]:

# check if there is class imbalance ?
df['Survived'].value_counts()

# this would work; otherwise you can try stratified split instead of random data split

Out[48]:

0.0    202
1.0    197
Name: Survived, dtype: int64

In [49]:

X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values

In [50]:

X.shape, Y.shape

Out[50]:

((399, 5), (399,))

In [51]:

# ideal practice is to use test as 20% - 30% of training data
# defined by test_size in train_test_split()
# random_state is required to avoid sequential biasness in the data distribution
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.3, random_state = 10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)

In [52]:

print X_train.shape, X_test.shape

(279, 5) (120, 5)

In [53]:

# this class takes care for scaling the features to the scale of 0-1
# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which 
# also has the range from 0-1
class Normalizer:

    def __init__(self):
        self.sc = StandardScaler()
    
    def scale(self, X, dtype):
        if dtype=='train':
            XX = self.sc.fit_transform(X)
        elif dtype=='test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX

In [54]:

norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')

In [55]:

class NaiveBayes:
    
    def __init__(self):
        self.classifier = GaussianNB()

    def train(self, X_train, Y_train):
        model = self.classifier.fit(X_train, Y_train)
        return model
    
    def predict(self, model, X_test):
        return model.predict(X_test)
    
    def evaluate(self, Y_test, Y_pred, measure):
        if measure=='matrix':
            cm = sklearn.metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])
            return cm
        elif measure=='accuracy':
            return sklearn.metrics.accuracy_score(Y_test, Y_pred)*100
        else: return None

In [56]:

nb = NaiveBayes()
model = nb.train(X_train, Y_train)
predictions = nb.predict(model, X_test)
print nb.evaluate(Y_test, predictions, 'accuracy')

71.66666666666667