import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import sklearn.metrics as metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv('smarket.csv')
smarket_dat = df.drop(df.columns[0], axis=1)  # drop the unnamed row-index column from the CSV
smarket_dat['Direction'] = smarket_dat['Direction'].map({'Up': 1, 'Down': 0})
smarket_dat.head()
 | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---|---
0 | 2001 | 0.381 | -0.192 | -2.624 | -1.055 | 5.010 | 1.1913 | 0.959 | 1 |
1 | 2001 | 0.959 | 0.381 | -0.192 | -2.624 | -1.055 | 1.2965 | 1.032 | 1 |
2 | 2001 | 1.032 | 0.959 | 0.381 | -0.192 | -2.624 | 1.4112 | -0.623 | 0 |
3 | 2001 | -0.623 | 1.032 | 0.959 | 0.381 | -0.192 | 1.2760 | 0.614 | 1 |
4 | 2001 | 0.614 | -0.623 | 1.032 | 0.959 | 0.381 | 1.2057 | 0.213 | 1 |
smarket_dat.describe()
 | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---|---
count | 1250.000000 | 1250.000000 | 1250.000000 | 1250.000000 | 1250.000000 | 1250.00000 | 1250.000000 | 1250.000000 | 1250.000000 |
mean | 2003.016000 | 0.003834 | 0.003919 | 0.001716 | 0.001636 | 0.00561 | 1.478305 | 0.003138 | 0.518400 |
std | 1.409018 | 1.136299 | 1.136280 | 1.138703 | 1.138774 | 1.14755 | 0.360357 | 1.136334 | 0.499861 |
min | 2001.000000 | -4.922000 | -4.922000 | -4.922000 | -4.922000 | -4.92200 | 0.356070 | -4.922000 | 0.000000 |
25% | 2002.000000 | -0.639500 | -0.639500 | -0.640000 | -0.640000 | -0.64000 | 1.257400 | -0.639500 | 0.000000 |
50% | 2003.000000 | 0.039000 | 0.039000 | 0.038500 | 0.038500 | 0.03850 | 1.422950 | 0.038500 | 1.000000 |
75% | 2004.000000 | 0.596750 | 0.596750 | 0.596750 | 0.596750 | 0.59700 | 1.641675 | 0.596750 | 1.000000 |
max | 2005.000000 | 5.733000 | 5.733000 | 5.733000 | 5.733000 | 5.73300 | 3.152470 | 5.733000 | 1.000000 |
_, ax = plt.subplots(figsize=(10,10))
sns.heatmap(smarket_dat.corr(), annot=True, ax=ax)
[Figure: annotated correlation heatmap of the Smarket variables]
_, ax = plt.subplots(figsize=(10,10))
sns.regplot(ax=ax, x='Year', y='Volume', data=smarket_dat)
[Figure: Volume against Year with a fitted trend line]
fig, ax = plt.subplots(figsize=(10,10))
sns.regplot(ax=ax, x='index', y='Volume', data=smarket_dat.reset_index())
[Figure: Volume against observation index with a fitted trend line]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=smarket_dat).fit()
logit_model.summary()
Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
Dep. Variable: | Direction | No. Observations: | 1250 |
---|---|---|---|
Model: | Logit | Df Residuals: | 1243 |
Method: | MLE | Df Model: | 6 |
Date: | Wed, 19 Sep 2018 | Pseudo R-squ.: | 0.002074 |
Time: | 17:58:15 | Log-Likelihood: | -863.79 |
converged: | True | LL-Null: | -865.59 |
 | | LLR p-value: | 0.7319 |
 | coef | std err | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---
Intercept | -0.1260 | 0.241 | -0.523 | 0.601 | -0.598 | 0.346 |
Lag1 | -0.0731 | 0.050 | -1.457 | 0.145 | -0.171 | 0.025 |
Lag2 | -0.0423 | 0.050 | -0.845 | 0.398 | -0.140 | 0.056 |
Lag3 | 0.0111 | 0.050 | 0.222 | 0.824 | -0.087 | 0.109 |
Lag4 | 0.0094 | 0.050 | 0.187 | 0.851 | -0.089 | 0.107 |
Lag5 | 0.0103 | 0.050 | 0.208 | 0.835 | -0.087 | 0.107 |
Volume | 0.1354 | 0.158 | 0.855 | 0.392 | -0.175 | 0.446 |
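None of the predictors is statistically significant at conventional levels; Lag1 has the smallest p-value (0.145), and its negative coefficient says a positive return yesterday makes the market slightly less likely to rise today.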
# Classify a day as Up (1) when the fitted probability exceeds 0.5
pred = (logit_model.predict(smarket_dat) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(smarket_dat['Direction'], pred)
# Transpose so that rows are the predicted class and columns the actual class
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
 | Down | Up
---|---|---
Down | 145 | 141 |
Up | 457 | 507 |
(145 + 507) / (145 + 141 + 457 + 507)
0.5216
(pred == smarket_dat['Direction']).sum() / smarket_dat.shape[0]
0.5216
(pred == smarket_dat['Direction']).mean()
0.5216
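All three computations agree on 52.16% training accuracy. For context, a minimal baseline check: a classifier that always predicts Up succeeds exactly as often as Direction equals 1.
# No-information baseline: fraction of Up days in the full sample.
smarket_dat['Direction'].mean()  # ≈ 0.5184, per the describe() table above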
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]    # train: 2001-2004
post = smarket_dat[~year_mask]  # held-out test set: 2005
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=pre).fit()
pred = (logit_model.predict(post) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
Optimization terminated successfully.
         Current function value: 0.691936
         Iterations 4
 | Down | Up
---|---|---
Down | 77 | 97 |
Up | 34 | 44 |
(pred == post['Direction']).mean()
0.4801587301587302
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2', data=pre).fit()
pred = (logit_model.predict(post) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
Optimization terminated successfully.
         Current function value: 0.692085
         Iterations 3
 | Down | Up
---|---|---
Down | 35 | 35 |
Up | 76 | 106 |
(pred == post['Direction']).mean()
0.5595238095238095
logit_model.predict(pd.DataFrame({'Lag1' : [1.2, 1.5], 'Lag2' : [1.1, -0.8]}))
0    0.479146
1    0.496094
dtype: float64
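Both predicted probabilities fall just below 0.5, so both of these hypothetical days would be classified as Down.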
lda = LinearDiscriminantAnalysis()
lda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
# Group means
pd.DataFrame(lda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])
 | Lag1 | Lag2
---|---|---
Down | 0.042790 | 0.033894 |
Up | -0.039546 | -0.031325 |
# Priors
pd.DataFrame(lda.priors_, index= ['Down', 'Up'], columns=['Prior'])
 | Prior
---|---
Down | 0.491984 |
Up | 0.508016 |
# Coefficients of the linear discriminants - NB: sklearn exposes these as `scalings_`
pd.DataFrame(lda.scalings_, columns=['LD1'], index=['Lag1', 'Lag2'])
 | LD1
---|---
Lag1 | -0.642019 |
Lag2 | -0.513529 |
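As a sanity check, the scalings should reproduce sklearn's own projection. A hedged sketch, assuming the default 'svd' solver, which stores the overall training mean as `xbar_` and implements `transform` as centring followed by projection (the plot below skips the centring, which only shifts the axis):
# Hedged check: for the 'svd' solver, transform(X) should equal (X - xbar_) @ scalings_.
manual = (pre[['Lag1', 'Lag2']].values - lda.xbar_) @ lda.scalings_
np.allclose(manual, lda.transform(pre[['Lag1', 'Lag2']]))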
LD1 = pre[['Lag1', 'Lag2']] @ lda.scalings_
LD1.columns = ['LD1']
LD1['Direction'] = pre['Direction']
LD1['c'] = 0  # dummy y-coordinate so the 1-D projection can be drawn as a strip
_, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='LD1', y='c', data=LD1, hue='Direction', ax=ax)
[Figure: training observations projected onto LD1, colored by Direction]
# Prediction accuracy and confusion matrix
preds = lda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])  # rows = predicted, columns = actual
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5595238095238095
 | Down | Up
---|---|---
Down | 35 | 35 |
Up | 76 | 106 |
# Get class probabilities
lda.predict_proba(post[['Lag1', 'Lag2']])
array([[0.49017925, 0.50982075],
       [0.4792185 , 0.5207815 ],
       [0.46681848, 0.53318152],
       ...,
       [0.4791988 , 0.5208012 ],
       [0.48316733, 0.51683267],
       [0.4892591 , 0.5107409 ]])
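Every posterior hugs 0.5, so a stricter cutoff empties the "predict Up" bucket quickly. A quick sketch counting 2005 days whose posterior for Up clears a tighter (arbitrarily chosen) threshold:
# How many days clear P(Up) > 0.5 versus a stricter 0.52 cutoff?
probs_up = lda.predict_proba(post[['Lag1', 'Lag2']])[:, 1]
print((probs_up > 0.5).sum(), (probs_up > 0.52).sum())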
# BONUS: Simulation in 1D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
df = pd.concat([class_1, class_2]).sample(frac=1)  # shuffle rows
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1']
LDs['c'] = 0
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='LD1', y='c', data=LDs, hue='class', ax=ax)
[Figure: simulated two-class data projected onto LD1, colored by class]
# BONUS: Simulation in 2D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
class_3 = pd.DataFrame({'class': 2,
                        'x1': np.random.normal(10, 2, 100),
                        'x2': np.random.normal(20, 2, 100),
                        'x3': np.random.normal(30, 2, 100)})
df = pd.concat([class_1, class_2, class_3]).sample(frac=1)  # shuffle rows
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1', 'LD2']
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(x='LD1', y='LD2', data=LDs, hue='class', ax=ax)
[Figure: simulated three-class data plotted on LD1 vs. LD2, colored by class]
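With three classes and three predictors, LDA yields min(K - 1, p) = 2 discriminants, hence the two-dimensional projection.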
qda = QuadraticDiscriminantAnalysis()
qda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
# Group means
pd.DataFrame(qda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])
 | Lag1 | Lag2
---|---|---
Down | 0.042790 | 0.033894 |
Up | -0.039546 | -0.031325 |
preds = qda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5992063492063492
 | Down | Up
---|---|---
Down | 30 | 20 |
Up | 81 | 121 |
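QDA's 59.9% test accuracy beats the 56% that both LDA and the reduced logistic model achieved on the same 2005 hold-out.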
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5
 | Down | Up
---|---|---
Down | 43 | 58 |
Up | 68 | 83 |
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5317460317460317
 | Down | Up
---|---|---
Down | 48 | 55 |
Up | 63 | 86 |
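Accuracy improves moving from k = 1 to k = 3. A sketch sweeping a few more (arbitrarily chosen) values of k:
# Test accuracy on the 2005 hold-out for several k.
for k in [1, 3, 5, 7, 9]:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(pre[['Lag1', 'Lag2']], pre['Direction'])
    print(k, (knn_k.predict(post[['Lag1', 'Lag2']]) == post['Direction']).mean())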
caravan_dat = pd.read_csv('caravan.csv')
caravan_dat = caravan_dat.drop(caravan_dat.columns[0], axis=1)
caravan_dat.head()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | Purchase
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 33 | 1 | 3 | 2 | 8 | 0 | 5 | 1 | 3 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
1 | 37 | 1 | 2 | 2 | 8 | 1 | 4 | 1 | 4 | 6 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
2 | 37 | 1 | 2 | 2 | 8 | 0 | 4 | 2 | 4 | 3 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
3 | 9 | 1 | 3 | 3 | 3 | 2 | 3 | 2 | 4 | 5 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
4 | 40 | 1 | 4 | 2 | 10 | 1 | 4 | 1 | 4 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
5 rows × 86 columns
caravan_dat.describe()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | ALEVEN | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | ... | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 |
mean | 24.253349 | 1.110615 | 2.678805 | 2.991240 | 5.773617 | 0.696496 | 4.626932 | 1.069907 | 3.258502 | 6.183442 | ... | 0.076606 | 0.005325 | 0.006527 | 0.004638 | 0.570079 | 0.000515 | 0.006012 | 0.031776 | 0.007901 | 0.014256 |
std | 12.846706 | 0.405842 | 0.789835 | 0.814589 | 2.856760 | 1.003234 | 1.715843 | 1.017503 | 1.597647 | 1.909482 | ... | 0.377569 | 0.072782 | 0.080532 | 0.077403 | 0.562058 | 0.022696 | 0.081632 | 0.210986 | 0.090463 | 0.119996 |
min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 0.000000 | 4.000000 | 0.000000 | 2.000000 | 5.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30.000000 | 1.000000 | 3.000000 | 3.000000 | 7.000000 | 0.000000 | 5.000000 | 1.000000 | 3.000000 | 6.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 35.000000 | 1.000000 | 3.000000 | 3.000000 | 8.000000 | 1.000000 | 6.000000 | 2.000000 | 4.000000 | 7.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 41.000000 | 10.000000 | 5.000000 | 6.000000 | 10.000000 | 9.000000 | 9.000000 | 5.000000 | 9.000000 | 9.000000 | ... | 8.000000 | 1.000000 | 1.000000 | 2.000000 | 7.000000 | 1.000000 | 2.000000 | 3.000000 | 2.000000 | 2.000000 |
8 rows × 85 columns
tmp = caravan_dat.drop('Purchase', axis=1)
caravan_dat_std = (tmp - tmp.mean()) / tmp.std()
caravan_dat_std['Purchase'] = caravan_dat['Purchase'].map({'Yes' : 1, 'No' : 0})
caravan_dat_std.head()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | Purchase
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.680848 | -0.272557 | 0.406662 | -1.216859 | 0.779338 | -0.694251 | 0.217425 | -0.068705 | -0.161802 | 0.427633 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
1 | 0.992212 | -0.272557 | -0.859426 | -1.216859 | 0.779338 | 0.302526 | -0.365379 | -0.068705 | 0.464119 | -0.096069 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
2 | 0.992212 | -0.272557 | -0.859426 | -1.216859 | 0.779338 | -0.694251 | -0.365379 | 0.914094 | 0.464119 | -1.667175 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
3 | -1.187335 | -0.272557 | 0.406662 | 0.010754 | -0.970896 | 1.299302 | -0.948183 | 0.914094 | 0.464119 | -0.619771 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
4 | 1.225735 | -0.272557 | 1.672750 | -1.216859 | 1.479432 | 0.302526 | -0.365379 | -0.068705 | 0.464119 | 0.427633 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
5 rows × 86 columns
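A quick sketch verifying the standardization: every predictor should now have mean ~0 and standard deviation 1.
# Largest absolute mean and the range of standard deviations across predictors.
X_std = caravan_dat_std.drop('Purchase', axis=1)
print(X_std.mean().abs().max(), X_std.std().min(), X_std.std().max())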
test = caravan_dat_std.iloc[0:1000]
train = caravan_dat_std.iloc[1000:]
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
# knn prediction accuracy with k=1
(preds == test['Purchase']).mean()
0.882
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
 | No | Yes
---|---|---
No | 873 | 50 |
Yes | 68 | 9 |
# naive prediction accuracy
(test['Purchase'] == 0).mean()
0.941
# knn positive predictive value with k = 1
9 / (68 + 9)
0.11688311688311688
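An 11.7% hit rate is roughly double the ~6% base rate of purchasers in the test set (1 - 0.941), so even k = 1 carries real signal.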
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
 | No | Yes
---|---|---
No | 930 | 55 |
Yes | 11 | 4 |
# knn positive predictive value with k = 5
4 / (11 + 4)
0.26666666666666666
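A sketch comparing positive predictive value across a few k on the same split (guarding against a k that flags nobody):
# Positive predictive value on the test set for several k.
X_tr, y_tr = train.drop('Purchase', axis=1), train['Purchase']
X_te, y_te = test.drop('Purchase', axis=1), test['Purchase']
for k in [1, 3, 5]:
    flagged = KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr).predict(X_te) == 1
    print(k, flagged.sum(), y_te[flagged].mean() if flagged.any() else float('nan'))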
formula = 'Purchase ~ ' + ' + '.join(train.drop('Purchase', axis=1).columns)
logit_model = smf.logit(formula= formula, data=train).fit()
preds = (logit_model.predict(test) > 0.5)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# positive predictive value is zero!
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.192013
         Iterations: 35
/usr/local/lib/python3.6/site-packages/statsmodels/base/model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
 | No | Yes
---|---|---
No | 934 | 59 |
Yes | 7 | 0 |
preds = (logit_model.predict(test) > 0.25)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# positive predictive value is (11 / 33) = 1/3
 | No | Yes
---|---|---
No | 919 | 48 |
Yes | 22 | 11 |
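Lowering the cutoff trades the number of flagged customers against precision. A closing sketch sweeping the cutoff over an arbitrary grid:
# Positive predictive value of the logit model at several probability cutoffs.
probs = logit_model.predict(test)
for cut in [0.5, 0.4, 0.3, 0.25, 0.2, 0.1]:
    flagged = probs > cut
    ppv = test['Purchase'][flagged].mean() if flagged.any() else float('nan')
    print(cut, int(flagged.sum()), ppv)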