import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import sklearn.metrics as metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv('smarket.csv')
smarket_dat = df.drop(df.columns[0], axis=1)  # drop the unnamed row-index column from the CSV
smarket_dat['Direction'] = smarket_dat['Direction'].map({'Up': 1, 'Down': 0})
smarket_dat.head()
 | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---|---
0 | 2001 | 0.381 | -0.192 | -2.624 | -1.055 | 5.010 | 1.1913 | 0.959 | 1 |
1 | 2001 | 0.959 | 0.381 | -0.192 | -2.624 | -1.055 | 1.2965 | 1.032 | 1 |
2 | 2001 | 1.032 | 0.959 | 0.381 | -0.192 | -2.624 | 1.4112 | -0.623 | 0 |
3 | 2001 | -0.623 | 1.032 | 0.959 | 0.381 | -0.192 | 1.2760 | 0.614 | 1 |
4 | 2001 | 0.614 | -0.623 | 1.032 | 0.959 | 0.381 | 1.2057 | 0.213 | 1 |
smarket_dat.describe()
 | Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---|---
count | 1250.000000 | 1250.000000 | 1250.000000 | 1250.000000 | 1250.000000 | 1250.00000 | 1250.000000 | 1250.000000 | 1250.000000 |
mean | 2003.016000 | 0.003834 | 0.003919 | 0.001716 | 0.001636 | 0.00561 | 1.478305 | 0.003138 | 0.518400 |
std | 1.409018 | 1.136299 | 1.136280 | 1.138703 | 1.138774 | 1.14755 | 0.360357 | 1.136334 | 0.499861 |
min | 2001.000000 | -4.922000 | -4.922000 | -4.922000 | -4.922000 | -4.92200 | 0.356070 | -4.922000 | 0.000000 |
25% | 2002.000000 | -0.639500 | -0.639500 | -0.640000 | -0.640000 | -0.64000 | 1.257400 | -0.639500 | 0.000000 |
50% | 2003.000000 | 0.039000 | 0.039000 | 0.038500 | 0.038500 | 0.03850 | 1.422950 | 0.038500 | 1.000000 |
75% | 2004.000000 | 0.596750 | 0.596750 | 0.596750 | 0.596750 | 0.59700 | 1.641675 | 0.596750 | 1.000000 |
max | 2005.000000 | 5.733000 | 5.733000 | 5.733000 | 5.733000 | 5.73300 | 3.152470 | 5.733000 | 1.000000 |
_, ax = plt.subplots(figsize=(10,10))
sns.heatmap(smarket_dat.corr(), annot=True, ax=ax)
[Figure: annotated correlation heatmap of the Smarket variables]
_, ax = plt.subplots(figsize=(10,10))
sns.regplot(ax=ax, x='Year', y='Volume', data=smarket_dat)
[Figure: Volume against Year with a fitted trend line]
fig, ax = plt.subplots(figsize=(10,10))
sns.regplot(ax=ax, x='index', y='Volume', data=smarket_dat.reset_index())
[Figure: Volume against observation index with a fitted trend line]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=smarket_dat).fit()
logit_model.summary()
Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
Dep. Variable: | Direction | No. Observations: | 1250 |
---|---|---|---|
Model: | Logit | Df Residuals: | 1243 |
Method: | MLE | Df Model: | 6 |
Date: | Wed, 19 Sep 2018 | Pseudo R-squ.: | 0.002074 |
Time: | 17:58:15 | Log-Likelihood: | -863.79 |
converged: | True | LL-Null: | -865.59 |
 | | LLR p-value: | 0.7319 |
 | coef | std err | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---
Intercept | -0.1260 | 0.241 | -0.523 | 0.601 | -0.598 | 0.346 |
Lag1 | -0.0731 | 0.050 | -1.457 | 0.145 | -0.171 | 0.025 |
Lag2 | -0.0423 | 0.050 | -0.845 | 0.398 | -0.140 | 0.056 |
Lag3 | 0.0111 | 0.050 | 0.222 | 0.824 | -0.087 | 0.109 |
Lag4 | 0.0094 | 0.050 | 0.187 | 0.851 | -0.089 | 0.107 |
Lag5 | 0.0103 | 0.050 | 0.208 | 0.835 | -0.087 | 0.107 |
Volume | 0.1354 | 0.158 | 0.855 | 0.392 | -0.175 | 0.446 |
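None of the predictors is statistically significant at conventional levels; Lag1 has the smallest p-value (0.145), and its negative coefficient says a positive return yesterday makes the market slightly less likely to rise today.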
# Classify a day as Up (1) when the fitted probability exceeds 0.5
pred = (logit_model.predict(smarket_dat) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(smarket_dat['Direction'], pred)
# Transpose so that rows are the predicted class and columns the actual class
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
 | Down | Up
---|---|---
Down | 145 | 141 |
Up | 457 | 507 |
(145 + 507) / (145 + 141 + 457 + 507)
0.5216
(pred == smarket_dat['Direction']).sum() / smarket_dat.shape[0]
0.5216
(pred == smarket_dat['Direction']).mean()
0.5216
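All three computations agree on 52.16% training accuracy. For context, a minimal baseline check: a classifier that always predicts Up succeeds exactly as often as Direction equals 1.
# No-information baseline: fraction of Up days in the full sample.
smarket_dat['Direction'].mean()  # ≈ 0.5184, per the describe() table above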
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]    # train: 2001-2004
post = smarket_dat[~year_mask]  # held-out test set: 2005
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume', data=pre).fit()
pred = (logit_model.predict(post) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
Optimization terminated successfully.
         Current function value: 0.691936
         Iterations 4
 | Down | Up
---|---|---
Down | 77 | 97 |
Up | 34 | 44 |
(pred == post['Direction']).mean()
0.4801587301587302
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2', data=pre).fit()
pred = (logit_model.predict(post) > 0.5).astype(int)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T
Optimization terminated successfully.
         Current function value: 0.692085
         Iterations 3
 | Down | Up
---|---|---
Down | 35 | 35 |
Up | 76 | 106 |
(pred == post['Direction']).mean()
0.5595238095238095
logit_model.predict(pd.DataFrame({'Lag1' : [1.2, 1.5], 'Lag2' : [1.1, -0.8]}))
0    0.479146
1    0.496094
dtype: float64
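Both predicted probabilities fall just below 0.5, so both of these hypothetical days would be classified as Down.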
lda = LinearDiscriminantAnalysis()
lda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
# Group means
pd.DataFrame(lda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])
 | Lag1 | Lag2
---|---|---
Down | 0.042790 | 0.033894 |
Up | -0.039546 | -0.031325 |
# Priors
pd.DataFrame(lda.priors_, index= ['Down', 'Up'], columns=['Prior'])
 | Prior
---|---
Down | 0.491984 |
Up | 0.508016 |
# Coefficients of the linear discriminants - NB: sklearn exposes these as `scalings_`
pd.DataFrame(lda.scalings_, columns=['LD1'], index=['Lag1', 'Lag2'])
 | LD1
---|---
Lag1 | -0.642019 |
Lag2 | -0.513529 |
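As a sanity check, the scalings should reproduce sklearn's own projection. A hedged sketch, assuming the default 'svd' solver, which stores the overall training mean as `xbar_` and implements `transform` as centring followed by projection (the plot below skips the centring, which only shifts the axis):
# Hedged check: for the 'svd' solver, transform(X) should equal (X - xbar_) @ scalings_.
manual = (pre[['Lag1', 'Lag2']].values - lda.xbar_) @ lda.scalings_
np.allclose(manual, lda.transform(pre[['Lag1', 'Lag2']]))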
LD1 = pre[['Lag1', 'Lag2']] @ lda.scalings_
LD1.columns = ['LD1']
LD1['Direction'] = pre['Direction']
LD1['c'] = 0  # dummy y-coordinate so the 1-D projection can be drawn as a strip
_, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='LD1', y='c', data=LD1, hue='Direction', ax=ax)
[Figure: training observations projected onto LD1, colored by Direction]
# Prediction accuracy and confusion matrix
preds = lda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])  # rows = predicted, columns = actual
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5595238095238095
 | Down | Up
---|---|---
Down | 35 | 35 |
Up | 76 | 106 |
# Get class probabilities
lda.predict_proba(post[['Lag1', 'Lag2']])
array([[0.49017925, 0.50982075],
       [0.4792185 , 0.5207815 ],
       [0.46681848, 0.53318152],
       ...,
       [0.4791988 , 0.5208012 ],
       [0.48316733, 0.51683267],
       [0.4892591 , 0.5107409 ]])
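Every posterior hugs 0.5, so a stricter cutoff empties the "predict Up" bucket quickly. A quick sketch counting 2005 days whose posterior for Up clears a tighter (arbitrarily chosen) threshold:
# How many days clear P(Up) > 0.5 versus a stricter 0.52 cutoff?
probs_up = lda.predict_proba(post[['Lag1', 'Lag2']])[:, 1]
print((probs_up > 0.5).sum(), (probs_up > 0.52).sum())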
# BONUS: Simulation in 1D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
df = pd.concat([class_1, class_2]).sample(frac=1)  # shuffle rows
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1']
LDs['c'] = 0
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='LD1', y='c', data=LDs, hue='class', ax=ax)
[Figure: simulated two-class data projected onto LD1, colored by class]
# BONUS: Simulation in 2D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
class_3 = pd.DataFrame({'class': 2,
                        'x1': np.random.normal(10, 2, 100),
                        'x2': np.random.normal(20, 2, 100),
                        'x3': np.random.normal(30, 2, 100)})
df = pd.concat([class_1, class_2, class_3]).sample(frac=1)  # shuffle rows
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1', 'LD2']
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10,10))
sns.scatterplot(x='LD1', y='LD2', data=LDs, hue='class', ax=ax)
[Figure: simulated three-class data plotted on LD1 vs. LD2, colored by class]
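With three classes and three predictors, LDA yields min(K - 1, p) = 2 discriminants, hence the two-dimensional projection.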
qda = QuadraticDiscriminantAnalysis()
qda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
# Group means
pd.DataFrame(qda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])
 | Lag1 | Lag2
---|---|---
Down | 0.042790 | 0.033894 |
Up | -0.039546 | -0.031325 |
preds = qda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5992063492063492
 | Down | Up
---|---|---
Down | 30 | 20 |
Up | 81 | 121 |
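QDA's 59.9% test accuracy beats the 56% that both LDA and the reduced logistic model achieved on the same 2005 hold-out.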
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5
 | Down | Up
---|---|---
Down | 43 | 58 |
Up | 68 | 83 |
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])
0.5317460317460317
 | Down | Up
---|---|---
Down | 48 | 55 |
Up | 63 | 86 |
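Accuracy improves moving from k = 1 to k = 3. A sketch sweeping a few more (arbitrarily chosen) values of k:
# Test accuracy on the 2005 hold-out for several k.
for k in [1, 3, 5, 7, 9]:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(pre[['Lag1', 'Lag2']], pre['Direction'])
    print(k, (knn_k.predict(post[['Lag1', 'Lag2']]) == post['Direction']).mean())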
caravan_dat = pd.read_csv('caravan.csv')
caravan_dat = caravan_dat.drop(caravan_dat.columns[0], axis=1)
caravan_dat.head()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | Purchase
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 33 | 1 | 3 | 2 | 8 | 0 | 5 | 1 | 3 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
1 | 37 | 1 | 2 | 2 | 8 | 1 | 4 | 1 | 4 | 6 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
2 | 37 | 1 | 2 | 2 | 8 | 0 | 4 | 2 | 4 | 3 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
3 | 9 | 1 | 3 | 3 | 3 | 2 | 3 | 2 | 4 | 5 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
4 | 40 | 1 | 4 | 2 | 10 | 1 | 4 | 1 | 4 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
5 rows × 86 columns
caravan_dat.describe()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | ALEVEN | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | ... | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 | 5822.000000 |
mean | 24.253349 | 1.110615 | 2.678805 | 2.991240 | 5.773617 | 0.696496 | 4.626932 | 1.069907 | 3.258502 | 6.183442 | ... | 0.076606 | 0.005325 | 0.006527 | 0.004638 | 0.570079 | 0.000515 | 0.006012 | 0.031776 | 0.007901 | 0.014256 |
std | 12.846706 | 0.405842 | 0.789835 | 0.814589 | 2.856760 | 1.003234 | 1.715843 | 1.017503 | 1.597647 | 1.909482 | ... | 0.377569 | 0.072782 | 0.080532 | 0.077403 | 0.562058 | 0.022696 | 0.081632 | 0.210986 | 0.090463 | 0.119996 |
min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 10.000000 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 0.000000 | 4.000000 | 0.000000 | 2.000000 | 5.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30.000000 | 1.000000 | 3.000000 | 3.000000 | 7.000000 | 0.000000 | 5.000000 | 1.000000 | 3.000000 | 6.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 35.000000 | 1.000000 | 3.000000 | 3.000000 | 8.000000 | 1.000000 | 6.000000 | 2.000000 | 4.000000 | 7.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 41.000000 | 10.000000 | 5.000000 | 6.000000 | 10.000000 | 9.000000 | 9.000000 | 5.000000 | 9.000000 | 9.000000 | ... | 8.000000 | 1.000000 | 1.000000 | 2.000000 | 7.000000 | 1.000000 | 2.000000 | 3.000000 | 2.000000 | 2.000000 |
8 rows × 85 columns
tmp = caravan_dat.drop('Purchase', axis=1)
caravan_dat_std = (tmp - tmp.mean()) / tmp.std()
caravan_dat_std['Purchase'] = caravan_dat['Purchase'].map({'Yes' : 1, 'No' : 0})
caravan_dat_std.head()
 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | MRELGE | ... | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | Purchase
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.680848 | -0.272557 | 0.406662 | -1.216859 | 0.779338 | -0.694251 | 0.217425 | -0.068705 | -0.161802 | 0.427633 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
1 | 0.992212 | -0.272557 | -0.859426 | -1.216859 | 0.779338 | 0.302526 | -0.365379 | -0.068705 | 0.464119 | -0.096069 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
2 | 0.992212 | -0.272557 | -0.859426 | -1.216859 | 0.779338 | -0.694251 | -0.365379 | 0.914094 | 0.464119 | -1.667175 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
3 | -1.187335 | -0.272557 | 0.406662 | 0.010754 | -0.970896 | 1.299302 | -0.948183 | 0.914094 | 0.464119 | -0.619771 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
4 | 1.225735 | -0.272557 | 1.672750 | -1.216859 | 1.479432 | 0.302526 | -0.365379 | -0.068705 | 0.464119 | 0.427633 | ... | -0.073159 | -0.081048 | -0.059915 | 0.764905 | -0.022704 | -0.073644 | -0.150608 | -0.08734 | -0.118806 | 0 |
5 rows × 86 columns
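A quick sketch verifying the standardization: every predictor should now have mean ~0 and standard deviation 1.
# Largest absolute mean and the range of standard deviations across predictors.
X_std = caravan_dat_std.drop('Purchase', axis=1)
print(X_std.mean().abs().max(), X_std.std().min(), X_std.std().max())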
test = caravan_dat_std.iloc[0:1000]
train = caravan_dat_std.iloc[1000:]
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
# knn prediction accuracy with k=1
(preds == test['Purchase']).mean()
0.882
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
 | No | Yes
---|---|---
No | 873 | 50 |
Yes | 68 | 9 |
# naive prediction accuracy
(test['Purchase'] == 0).mean()
0.941
# knn positive predictive value with k = 1
9 / (68 + 9)
0.11688311688311688
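An 11.7% hit rate is roughly double the ~6% base rate of purchasers in the test set (1 - 0.941), so even k = 1 carries real signal.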
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
 | No | Yes
---|---|---
No | 930 | 55 |
Yes | 11 | 4 |
# knn positive predictive value with k = 5
4 / (11 + 4)
0.26666666666666666
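A sketch comparing positive predictive value across a few k on the same split (guarding against a k that flags nobody):
# Positive predictive value on the test set for several k.
X_tr, y_tr = train.drop('Purchase', axis=1), train['Purchase']
X_te, y_te = test.drop('Purchase', axis=1), test['Purchase']
for k in [1, 3, 5]:
    flagged = KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr).predict(X_te) == 1
    print(k, flagged.sum(), y_te[flagged].mean() if flagged.any() else float('nan'))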
formula = 'Purchase ~ ' + ' + '.join(train.drop('Purchase', axis=1).columns)
logit_model = smf.logit(formula= formula, data=train).fit()
preds = (logit_model.predict(test) > 0.5)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# positive predictive value is zero!
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.192013
         Iterations: 35
/usr/local/lib/python3.6/site-packages/statsmodels/base/model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
 | No | Yes
---|---|---
No | 934 | 59 |
Yes | 7 | 0 |
preds = (logit_model.predict(test) > 0.25)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# positive predictive value is (11 / 33) = 1/3
 | No | Yes
---|---|---
No | 919 | 48 |
Yes | 22 | 11 |
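Lowering the cutoff trades the number of flagged customers against precision. A closing sketch sweeping the cutoff over an arbitrary grid:
# Positive predictive value of the logit model at several probability cutoffs.
probs = logit_model.predict(test)
for cut in [0.5, 0.4, 0.3, 0.25, 0.2, 0.1]:
    flagged = probs > cut
    ppv = test['Purchase'][flagged].mean() if flagged.any() else float('nan')
    print(cut, int(flagged.sum()), ppv)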