# Imports libraries
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import os
print pd.__version__
0.20.3
# reading table as dataframe
DATA_DIR = '../data'
df = pd.read_table(
os.path.abspath(os.path.join(DATA_DIR, 'day8/titanic.csv')),
sep=',',
index_col='PassengerId'
)
df.head(5)
Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | |||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.shape
(891, 11)
# identifiying the missing values across all the colums
df.isnull().sum()
Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
# dropping name, cabin, Fare, Embarked ticket columns
# Name does not give any information if a person will live or not
# cabin, Ticket, Fare are correlated to eachother and to PClass; so removing
df.drop(['Cabin', 'Ticket', 'Name', 'Fare', 'Embarked'], axis=1, inplace=True)
# fill age mean to NaN value
df[df['Age'].isnull()] = df['Age'].mean()
# check for any other NaN value
df.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 dtype: int64
df.head(5)
Survived | Pclass | Sex | Age | SibSp | Parch | |
---|---|---|---|---|---|---|
PassengerId | ||||||
1 | 0.0 | 3.0 | male | 22.0 | 1.0 | 0.0 |
2 | 1.0 | 1.0 | female | 38.0 | 1.0 | 0.0 |
3 | 1.0 | 3.0 | female | 26.0 | 0.0 | 0.0 |
4 | 1.0 | 1.0 | female | 35.0 | 1.0 | 0.0 |
5 | 0.0 | 3.0 | male | 35.0 | 0.0 | 0.0 |
# converting Sex to numbers
replacements_sex = {'male': 0, 'female': 1}
df['Sex'].replace(replacements_sex, inplace=True)
df.head(6)
Survived | Pclass | Sex | Age | SibSp | Parch | |
---|---|---|---|---|---|---|
PassengerId | ||||||
1 | 0.000000 | 3.000000 | 0.000000 | 22.000000 | 1.000000 | 0.000000 |
2 | 1.000000 | 1.000000 | 1.000000 | 38.000000 | 1.000000 | 0.000000 |
3 | 1.000000 | 3.000000 | 1.000000 | 26.000000 | 0.000000 | 0.000000 |
4 | 1.000000 | 1.000000 | 1.000000 | 35.000000 | 1.000000 | 0.000000 |
5 | 0.000000 | 3.000000 | 0.000000 | 35.000000 | 0.000000 | 0.000000 |
6 | 29.699118 | 29.699118 | 29.699118 | 29.699118 | 29.699118 | 29.699118 |
# removing duplicates
# redundancy does not help our model to generalize better; introduces biasness
df.drop_duplicates(keep=False, inplace=True)
# check if there is class imbalance ?
df['Survived'].value_counts()
# this would work; otherwise you can try stratified split instead of random data split
0.0 202 1.0 197 Name: Survived, dtype: int64
X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values
X.shape, Y.shape
((399, 5), (399,))
# ideal practice is to use test as 20% - 30% of training data
# defined by test_size in train_test_split()
# random_state is required to avoid sequential biasness in the data distribution
def data_split(X, Y):
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.3, random_state = 10)
return X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = data_split(X, Y)
print X_train.shape, X_test.shape
(279, 5) (120, 5)
# this class takes care for scaling the features to the scale of 0-1
# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which
# also has the range from 0-1
class Normalizer:
def __init__(self):
self.sc = StandardScaler()
def scale(self, X, dtype):
if dtype=='train':
XX = self.sc.fit_transform(X)
elif dtype=='test':
XX = self.sc.transform(X)
else:
return None
return XX
norm = Normalizer()
X_train = norm.scale(X_train, 'train')
X_test = norm.scale(X_test, 'test')
class NaiveBayes:
def __init__(self):
self.classifier = GaussianNB()
def train(self, X_train, Y_train):
model = self.classifier.fit(X_train, Y_train)
return model
def predict(self, model, X_test):
return model.predict(X_test)
def evaluate(self, Y_test, Y_pred, measure):
if measure=='matrix':
cm = sklearn.metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])
return cm
elif measure=='accuracy':
return sklearn.metrics.accuracy_score(Y_test, Y_pred)*100
else: return None
nb = NaiveBayes()
model = nb.train(X_train, Y_train)
predictions = nb.predict(model, X_test)
print nb.evaluate(Y_test, predictions, 'accuracy')
71.66666666666667