Bayesian Logistic Regression on the Kaggle Titanic dataset

In [1]:
%matplotlib inline
import pandas as pd
import seaborn
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [10]:
import qgrid
qgrid.nbinstall()
In [11]:
from pymc3 import Model, Normal, Bernoulli, find_MAP, NUTS, sample, traceplot

Load dataset

In [12]:
train_data = pd.read_csv('input/train.csv')
test_data = pd.read_csv('input/test.csv')

Fill in missing values in the Age column with the mean age from the training set

In [13]:
mean_age = np.mean(train_data.loc[~np.isnan(train_data['Age']), 'Age'])
train_data.loc[np.isnan(train_data['Age']), 'Age'] = mean_age
test_data.loc[np.isnan(test_data['Age']), 'Age'] = mean_age
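
A quick optional sanity check (a sketch, not in the original notebook) confirming that the imputation left no missing ages:

assert train_data['Age'].notnull().all()  # no NaNs should remain in the training ages
assert test_data['Age'].notnull().all()   # same for the test set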
In [15]:
qgrid.show_grid(train_data, remote_js=True)

Prepare the training data

In [7]:
y = train_data["Survived"]
In [8]:
x = train_data[["Pclass", "Age"]].copy()
x["Intercept"] = 1
x["Sex"] = (train_data["Sex"] == "male") * 1

Build the model

In [8]:
import theano.tensor as t
def tinvlogit(x):
    # inverse-logit (sigmoid) written with Theano ops so it can be used inside the model
    return t.exp(x) / (1 + t.exp(x))
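
As an aside, Theano also provides a built-in sigmoid that computes the same function; a minimal alternative sketch (exp(x) / (1 + exp(x)) can overflow for large positive x, which the built-in op avoids):

def tinvlogit_alt(x):
    # equivalent inverse-logit using Theano's built-in sigmoid op
    return t.nnet.sigmoid(x)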
In [9]:
logistic_model = Model()

with logistic_model:
    # Priors: independent Normal(mean 0, sd 2) on each coefficient (tau = 1/sd^2)
    effects = Normal('effects', mu=0, tau=2. ** -2, shape=(1, x.shape[1]))
    
    # expected probability of survival for each passenger
    p = tinvlogit(t.sum(effects * x, axis=1))
    
    # Likelihood (sampling distribution) of observations
    Y_obs = Bernoulli('Y_obs', p=p, observed=y)
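
In notation, the model encoded above (tau = 2^-2 corresponds to a prior standard deviation of 2) is:

$$\beta_j \sim \mathcal{N}(0,\, 2^2), \qquad p_i = \operatorname{logit}^{-1}(x_i^\top \beta), \qquad y_i \sim \operatorname{Bernoulli}(p_i)$$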
In [10]:
with logistic_model:
    # move the chain to the MAP which should be a good starting point
    start = find_MAP()
    step = NUTS(scaling=start)
    trace = sample(5000, step, start)
 [-----------------100%-----------------] 5000 of 5000 complete in 61.5 sec
In [11]:
traceplot(trace)
Out[11]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fa8eb2e3ac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa8eb0b2d30>]], dtype=object)
In [12]:
trace['effects']
Out[12]:
array([[[-1.09676879, -0.02903348,  4.48974061, -2.45753634]],

       [[-1.09676879, -0.02903348,  4.48974061, -2.45753634]],

       [[-1.16002106, -0.0338922 ,  4.51968149, -2.55373352]],

       ..., 
       [[-1.24548209, -0.03337827,  5.10623622, -3.00418852]],

       [[-1.13716367, -0.03495873,  4.4685661 , -2.53877428]],

       [[-1.13443239, -0.02604646,  4.49673206, -2.44640603]]])

Prepare the testing data

In [13]:
new_x = test_data[["Pclass", "Age"]].copy()
new_x["Intercept"] = 1
new_x["Sex"] = (test_data["Sex"] == "male") * 1
# new_x = new_x.drop("Pclass", 1)
In [14]:
def invlogit(x):
    # NumPy version of the inverse-logit, for transforming etas outside the model
    return np.exp(x) / (1 + np.exp(x))

Average the linear predictors (etas) implied by the posterior samples

In [15]:
eta = np.zeros(new_x.shape[0])
for row in trace['effects']:
    row = row[0]  # drop the leading (1, n_features) axis
    eta = np.add(eta, np.dot(new_x, row.transpose()))
eta = np.divide(eta, trace['effects'].shape[0])
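
Because the dot product is linear, averaging the per-sample etas is the same as taking the dot product with the posterior-mean coefficients; a vectorized sketch of the same computation (assuming trace['effects'] has shape (n_samples, 1, n_features)):

mean_effects = trace['effects'][:, 0, :].mean(axis=0)  # posterior mean of the coefficients
eta_vec = np.dot(new_x.values, mean_effects)           # same eta as the loop above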
In [16]:
p = invlogit(eta)
p[:20]
Out[16]:
array([ 0.07725205,  0.42751563,  0.09950293,  0.09527585,  0.61603883,
        0.13550032,  0.5567653 ,  0.24945878,  0.64454149,  0.11231489,
        0.0883926 ,  0.35559176,  0.93581827,  0.09679533,  0.87495842,
        0.8220575 ,  0.20152091,  0.11231489,  0.57927939,  0.44255144])

Prepare the submission

In [17]:
prediction = pd.DataFrame({'Survived_p':p})
In [18]:
prediction["PassengerId"] = test_data["PassengerId"]
prediction['Survived'] = (prediction['Survived_p']>0.5) * 1
In [19]:
prediction.drop('Survived_p', 1).to_csv('output.csv', index=False)
In [20]:
prediction['Sex'] = test_data["Sex"]
In [21]:
prediction.head()
Out[21]:
Survived_p PassengerId Survived Sex
0 0.077 892 0 male
1 0.428 893 0 female
2 0.100 894 0 male
3 0.095 895 0 male
4 0.616 896 1 female