4.1 An Overview of Classification
4.4 Linear Discriminant Analysis
4.5 A Comparison of Classification Methods
4.6 Lab: Logistic Regression, LDA, QDA, and KNN
4.6.1 The Stock Market Data
4.6.2 Logistic Regression
4.6.3 Linear Discriminant Analysis (LDA)
4.6.4 Quadratic Discriminant Analysis (QDA)
4.6.5 K-Nearest Neighbors
4.6.6 An Application to Caravan Insurance Data
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, roc_curve, auc, log_loss
from sklearn import preprocessing
from sklearn import neighbors
from scipy import stats
import scikitplot as skplt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from ipywidgets import widgets
from classification_helper import print_classification_statistics, plot_ROC, print_OLS_error_table, plot_classification
%matplotlib inline
plt.style.use('seaborn-white')
# In R, I exported the dataset from package 'ISLR' to an Excel file
df_default = pd.read_excel('Data/Default.xlsx')
# Note: factorize() returns two objects: a label array and an array with the unique values.
# We are only interested in the first object.
df_default['default2'] = df_default.default.factorize()[0]
df_default['student2'] = df_default.student.factorize()[0]
df_default.head(3)
# | default | student | balance | income | default2 | student2
---|---|---|---|---|---|---|
1 | No | No | 729.526495 | 44361.625074 | 0 | 0 |
2 | No | Yes | 817.180407 | 12106.134700 | 0 | 1 |
3 | No | No | 1073.549164 | 31767.138947 | 0 | 0 |
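As a quick sanity check (a minimal sketch, not in the original notebook), a cross-tabulation confirms that factorize() encoded 'No' as 0 and 'Yes' as 1, following the order in which the labels first appear:
# illustrative check of the integer coding produced by factorize()
pd.crosstab(df_default.default, df_default.default2)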
fig = plt.figure(figsize=(12,5))
gs = mpl.gridspec.GridSpec(1, 4)
ax1 = plt.subplot(gs[0,:-2])
ax2 = plt.subplot(gs[0,-2])
ax3 = plt.subplot(gs[0,-1])
# Take a fraction of the samples where target value (default) is 'no'
df_no = df_default[df_default.default2 == 0].sample(frac=.08)
# Take all samples where target value is 'yes'
df_yes = df_default[df_default.default2 == 1]
ax1.scatter(df_yes.balance, df_yes.income, s=40, c='orange', marker='+', linewidths=1)
ax1.scatter(df_no.balance, df_no.income, s=40, marker='o', linewidths=1,
edgecolors='lightblue', facecolors='white', alpha=.6)
ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')
c_palette = {'No':'lightblue', 'Yes':'orange'}
sns.boxplot(x='default', y='balance', data=df_default, orient='v', ax=ax2, palette=c_palette)
sns.boxplot(x='default', y='income', data=df_default, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(plt.gcf())
X_train = df_default.balance.values.reshape(-1,1)
y = df_default.default2
# Create array of test data. Calculate the classification probability
# and predicted classification.
X_plot = np.arange(df_default.balance.min(), df_default.balance.max()).reshape(-1,1)
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train,y)
prob = clf.predict_proba(X_plot)
fig = plt.figure()
ax = plt.axes()
# Right plot
ax.scatter(X_train, y, color='orange')
ax.plot(X_plot, prob[:,1], color='lightblue')
ax.hlines(1, xmin=ax.xaxis.get_data_interval()[0],
xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.hlines(0, xmin=ax.xaxis.get_data_interval()[0],
xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.set_ylabel('Probability of default')
ax.set_xlabel('Balance')
ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
ax.set_xlim(xmin=-100)
print(clf)
print('classes: ', clf.classes_)
print('coefficients: ', [*clf.intercept_, *clf.coef_.tolist()[0]])
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
classes:  [0 1]
coefficients:  [-10.651330005794106, 0.0054989165568046445]
X_train = sm.add_constant(df_default.balance)
y = df_default.default2
est = sm.Logit(y, X_train).fit()
est.summary2().tables[1]
Optimization terminated successfully. Current function value: 0.079823 Iterations 10
variable | Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---|
const | -10.651331 | 0.361169 | -29.491287 | 3.723665e-191 | -11.359208 | -9.943453 |
balance | 0.005499 | 0.000220 | 24.952404 | 2.010855e-137 | 0.005067 | 0.005931 |
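To make these coefficients concrete, the fitted model can be evaluated at a couple of illustrative balances (1000 and 2000, the values used in ISLR); the predicted default probabilities should come out well under 1% and roughly 59%, respectively. A minimal sketch:
# predicted probability of default at balances of 1000 and 2000
# (rows are [const, balance], matching the columns of X_train)
print(est.predict([[1, 1000], [1, 2000]]))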
X_train = sm.add_constant(df_default.student2)
y = df_default.default2
est = sm.Logit(y, X_train).fit()
est.summary2().tables[1]
Optimization terminated successfully. Current function value: 0.145434 Iterations 7
variable | Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---|
const | -3.504128 | 0.070713 | -49.554094 | 0.000000 | -3.642723 | -3.365532 |
student2 | 0.404887 | 0.115019 | 3.520177 | 0.000431 | 0.179454 | 0.630320 |
X_train = sm.add_constant(df_default[['balance', 'income', 'student2']])
est = sm.Logit(y, X_train).fit()
est.summary2()
Optimization terminated successfully. Current function value: 0.078577 Iterations 10
Model: | Logit | Pseudo R-squared: | 0.462 |
Dependent Variable: | default2 | AIC: | 1579.5448 |
Date: | 2018-06-23 13:11 | BIC: | 1608.3862 |
No. Observations: | 10000 | Log-Likelihood: | -785.77 |
Df Model: | 3 | LL-Null: | -1460.3 |
Df Residuals: | 9996 | LLR p-value: | 3.2575e-292 |
Converged: | 1.0000 | Scale: | 1.0000 |
No. Iterations: | 10.0000 |
variable | Coef. | Std.Err. | z | P>|z| | [0.025 | 0.975]
---|---|---|---|---|---|---|
const | -10.8690 | 0.4923 | -22.0793 | 0.0000 | -11.8339 | -9.9042 |
balance | 0.0057 | 0.0002 | 24.7365 | 0.0000 | 0.0053 | 0.0062 |
income | 0.0000 | 0.0000 | 0.3698 | 0.7115 | -0.0000 | 0.0000 |
student2 | -0.6468 | 0.2363 | -2.7376 | 0.0062 | -1.1098 | -0.1837 |
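Note the sign flip: the student coefficient was positive in the one-variable fit above but is negative once balance and income are held fixed. A minimal sketch makes this concrete (balance = 1500 and income = 40000 are illustrative values): at the same balance and income, the predicted default probability is lower for a student than for a non-student, roughly 6% versus 11%.
# predicted default probability for a student vs. a non-student
# (rows are [const, balance, income, student2], matching the columns of X_train)
print(est.predict([[1, 1500, 40000, 1], [1, 1500, 40000, 0]]))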
def generate_LDA(mean1=-2, mean2=1, mean3=2, sigma1=0.5, sigma2=0.5):
mean1 = mean1*np.array([1, 1])
mean2 = mean2*np.array([1, 1])
mean3 = mean3*np.array([1, -1])
cov = np.array([[sigma1, 0], [0, sigma2]])
N = 500
K = 3
# if you sample from a t-distribution instead, the LDA results are really bad
# (helper below is unused here; kept for experimentation)
def multivariate_t(means, S, N):
df = 1
m = np.asarray(means)
d = len(means)
x = np.random.chisquare(df, N)/df
z = np.random.multivariate_normal(np.zeros(d), S, N)
return m + z/np.sqrt(x)[:,None]
sample1 = np.random.multivariate_normal(mean1, cov, (N,))
sample2 = np.random.multivariate_normal(mean2, cov, (N,))
sample3 = np.random.multivariate_normal(mean3, cov, (N,))
maxX = np.max([sample1[:,0], sample2[:,0], sample3[:,0]])
minX = np.min([sample1[:,0], sample2[:,0], sample3[:,0]])
maxY = np.max([sample1[:,1], sample2[:,1], sample3[:,1]])
minY = np.min([sample1[:,1], sample2[:,1], sample3[:,1]])
# priors
pi1 = pi2 = pi3 = 1/K  # equal priors; each class contributes N of the 3*N points
# grid of points to plot the bayes and LDA regions/lines
N_points_grid = 200
xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
X = np.c_[xx.ravel(), yy.ravel()]
# Bayes regions
inv_cov = np.linalg.inv(cov)
delta1_fun = lambda X: np.dot(X, np.dot(inv_cov, mean1)) - 1/2*np.dot(mean1.T, np.dot(inv_cov, mean1)) + np.log(pi1)
delta2_fun = lambda X: np.dot(X, np.dot(inv_cov, mean2)) - 1/2*np.dot(mean2.T, np.dot(inv_cov, mean2)) + np.log(pi2)
delta3_fun = lambda X: np.dot(X, np.dot(inv_cov, mean3)) - 1/2*np.dot(mean3.T, np.dot(inv_cov, mean3)) + np.log(pi3)
region1 = np.logical_and(delta1_fun(X) > delta2_fun(X), delta1_fun(X) > delta3_fun(X))
region2 = np.logical_and(delta2_fun(X) > delta1_fun(X), delta2_fun(X) > delta3_fun(X))
region3 = np.logical_and(delta3_fun(X) > delta1_fun(X), delta3_fun(X) > delta2_fun(X))
# LDA prediction
est_mean1 = 1/N*np.sum(sample1, axis=0)
est_mean2 = 1/N*np.sum(sample2, axis=0)
est_mean3 = 1/N*np.sum(sample3, axis=0)
est_cov = (np.cov(sample1, rowvar=False) + np.cov(sample2, rowvar=False) + np.cov(sample3, rowvar=False))/K
inv_est_cov = np.linalg.inv(est_cov)
est_delta1_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean1)) - 1/2*np.dot(est_mean1.T, np.dot(inv_est_cov, est_mean1)) + np.log(pi1)
est_delta2_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean2)) - 1/2*np.dot(est_mean2.T, np.dot(inv_est_cov, est_mean2)) + np.log(pi2)
est_delta3_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean3)) - 1/2*np.dot(est_mean3.T, np.dot(inv_est_cov, est_mean3)) + np.log(pi3)
est_region1 = np.logical_and(est_delta1_fun(X) > est_delta2_fun(X), est_delta1_fun(X) > est_delta3_fun(X))
est_region3 = np.logical_and(est_delta3_fun(X) > est_delta2_fun(X), est_delta3_fun(X) > est_delta1_fun(X))
### Plot Bayes regions and LDA lines
fig = plt.figure(figsize=(8,8))
ax = plt.subplot(1,1,1)
# Bayes regions
plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
plt.contourf(xx, yy, region3.reshape(xx.shape), alpha=0.5, colors='b', levels=[0.5, 1.0])
# Samples
ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o', label='1')
ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o', label='2')
ax.scatter(sample3[:,0], sample3[:,1], s=20, c='blue', marker='o', label='3')
ax.set_xlabel('X1');
ax.set_ylabel('X2');
# LDA lines
plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k')
plt.contour(xx, yy, est_region3.reshape(xx.shape), alpha=0.5, colors='k');
# statistics
pred_green = sum([np.logical_and(delta1_fun(x) > delta2_fun(x), delta1_fun(x) > delta3_fun(x)) for x in sample1])/N*100
pred_orange = sum([np.logical_and(delta2_fun(x) > delta1_fun(x), delta2_fun(x) > delta3_fun(x)) for x in sample2])/N*100
pred_blue = sum([np.logical_and(delta3_fun(x) > delta2_fun(x), delta3_fun(x) > delta1_fun(x)) for x in sample3])/N*100
print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1), np.round(pred_blue, 1))
est_pred_green = sum([np.logical_and(est_delta1_fun(x) > est_delta2_fun(x), est_delta1_fun(x) > est_delta3_fun(x)) for x in sample1])/N*100
est_pred_orange = sum([np.logical_and(est_delta2_fun(x) > est_delta1_fun(x), est_delta2_fun(x) > est_delta3_fun(x)) for x in sample2])/N*100
est_pred_blue = sum([np.logical_and(est_delta3_fun(x) > est_delta2_fun(x), est_delta3_fun(x) > est_delta1_fun(x)) for x in sample3])/N*100
print('LDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1), np.round(est_pred_blue, 1))
# continuous_update is a slider option, so it is set on the FloatSlider widgets themselves
interactive_plot = widgets.interactive(generate_LDA,
mean1=widgets.FloatSlider(value=-2, min=-2, max=2, step=0.5, continuous_update=False),
mean2=widgets.FloatSlider(value=1, min=-2, max=2, step=0.5, continuous_update=False),
mean3=widgets.FloatSlider(value=2, min=-2, max=2, step=0.5, continuous_update=False),
sigma1=widgets.FloatSlider(value=0.5, min=0.1, max=5, step=0.1, continuous_update=False),
sigma2=widgets.FloatSlider(value=0.5, min=0.1, max=5, step=0.1, continuous_update=False));
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot
[interactive plot: FloatSliders for mean1, mean2, mean3, sigma1 and sigma2]
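The discriminant functions above are coded by hand. As a cross-check (a minimal sketch, not part of the widget), scikit-learn's LinearDiscriminantAnalysis can be fit to the same kind of simulated three-class data used by the default slider settings and should give very similar decision regions and accuracy:
# simulate three Gaussian classes (default widget settings) and fit sklearn's LDA
rng = np.random.RandomState(0)
means = [np.array([-2, -2]), np.array([1, 1]), np.array([2, -2])]
cov = np.array([[0.5, 0], [0, 0.5]])
N = 500
X_sim = np.vstack([rng.multivariate_normal(m, cov, N) for m in means])
y_sim = np.repeat([0, 1, 2], N)
lda_check = LinearDiscriminantAnalysis().fit(X_sim, y_sim)
print('sklearn LDA training accuracy:', lda_check.score(X_sim, y_sim))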
Default dataset
X = df_default[['balance', 'income', 'student2']].values
y = df_default.default2.values
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X, y)
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)
print_classification_statistics(lda, X, y, labels=['No', 'Yes'])
plot_ROC(lda, X, y, label='LDA Classification')
Classification Report:
             precision    recall  f1-score   support
         No      0.974     0.998     0.986      9667
        Yes      0.782     0.237     0.364       333
avg / total      0.968     0.972     0.965     10000

Confusion Matrix:
Predicted      True     False
Real
True       0.997724  0.002276
False      0.762763  0.237237
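LDA uses a 0.5 threshold on the posterior probability by default, which catches very few of the actual defaulters here. Following the discussion in ISLR, the threshold can be lowered to trade false negatives for false positives; a minimal sketch with an illustrative 0.2 threshold:
# classify as 'Yes' (default) whenever the posterior probability exceeds 0.2
posterior_default = lda.predict_proba(X)[:, 1]
pred_02 = (posterior_default > 0.2).astype(int)
print(confusion_matrix(y, pred_02))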
def plot_QDA(mean1=-2, mean2=1, sigma1=1, sigma2=0.5):
mean1 = mean1*np.array([1, 1])
mean2 = mean2*np.array([1, 1])
cov1 = np.array([[sigma1, 0], [0, sigma1]])
cov2 = np.array([[sigma2, 0], [0, sigma2]])
inv_cov1 = np.linalg.inv(cov1)
inv_cov2 = np.linalg.inv(cov2)
N = 500
K = 2
sample1 = np.random.multivariate_normal(mean1, cov1, (N,))
sample2 = np.random.multivariate_normal(mean2, cov2, (N,))
maxX = np.max([sample1[:,0], sample2[:,0]])
minX = np.min([sample1[:,0], sample2[:,0]])
maxY = np.max([sample1[:,1], sample2[:,1]])
minY = np.min([sample1[:,1], sample2[:,1]])
pi1 = pi2 = 1/K  # equal priors
# grid of points to plot the bayes and LDA regions/lines
N_points_grid = 150
xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
X = np.c_[xx.ravel(), yy.ravel()]
# QDA discriminants: quadratic term, log-determinant term, and log prior (ISLR eq. 4.23)
delta1_fun = lambda Xin: -1/2*((Xin-mean1).dot(inv_cov1)*(Xin-mean1)).sum(axis=1) - 1/2*np.log(np.linalg.det(cov1)) + np.log(pi1)
delta2_fun = lambda Xin: -1/2*((Xin-mean2).dot(inv_cov2)*(Xin-mean2)).sum(axis=1) - 1/2*np.log(np.linalg.det(cov2)) + np.log(pi2)
region1 = delta1_fun(X) > delta2_fun(X)
region2 = delta2_fun(X) > delta1_fun(X)
# prediction
est_mean1 = 1/N*np.sum(sample1, axis=0)
est_mean2 = 1/N*np.sum(sample2, axis=0)
est_cov1 = np.cov(sample1, rowvar=False)
est_cov2 = np.cov(sample2, rowvar=False)
inv_est_cov1 = np.linalg.inv(est_cov1)
inv_est_cov2 = np.linalg.inv(est_cov2)
est_delta1_fun = lambda Xin: -1/2*((Xin-est_mean1).dot(inv_est_cov1)*(Xin-est_mean1)).sum(axis=1) - 1/2*np.log(np.linalg.det(est_cov1)) + np.log(pi1)
est_delta2_fun = lambda Xin: -1/2*((Xin-est_mean2).dot(inv_est_cov2)*(Xin-est_mean2)).sum(axis=1) - 1/2*np.log(np.linalg.det(est_cov2)) + np.log(pi2)
est_region1 = est_delta1_fun(X) > est_delta2_fun(X)
fig = plt.figure(figsize=(8,8))
ax = plt.subplot(1,1,1)
# Bayes regions
plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
# Samples
ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o',)
ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o',)
ax.set_xlabel('X1');
ax.set_ylabel('X2');
# QDA boundary estimated from the samples
plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k');
pred_green = sum(delta1_fun(sample1) > delta2_fun(sample1))/N*100
pred_orange = sum(delta2_fun(sample2) > delta1_fun(sample2))/N*100
print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1))
est_pred_green = sum(est_delta1_fun(sample1) > est_delta2_fun(sample1))/N*100
est_pred_orange = sum(est_delta2_fun(sample2) > est_delta1_fun(sample2))/N*100
print('QDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1))
interactive_plot = widgets.interactive(plot_QDA,
mean1=widgets.FloatSlider(value=-2, min=-2, max=2, step=0.5, continuous_update=False),
mean2=widgets.FloatSlider(value=1, min=-2, max=2, step=0.5, continuous_update=False),
sigma1=widgets.FloatSlider(value=1, min=0.1, max=5, step=0.1, continuous_update=False),
sigma2=widgets.FloatSlider(value=0.5, min=0.1, max=5, step=0.1, continuous_update=False));
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot
[interactive plot: FloatSliders for mean1, mean2, sigma1 and sigma2]
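As with LDA above, scikit-learn's QuadraticDiscriminantAnalysis offers a cross-check for the hand-coded discriminants; a minimal sketch on simulated two-class data matching the default slider settings (unequal covariances):
# two Gaussian classes with different variances, as in the widget defaults
rng = np.random.RandomState(1)
X_sim = np.vstack([rng.multivariate_normal([-2, -2], np.eye(2), 500),
                   rng.multivariate_normal([1, 1], 0.5*np.eye(2), 500)])
y_sim = np.repeat([0, 1], 500)
qda_check = QuadraticDiscriminantAnalysis().fit(X_sim, y_sim)
print('sklearn QDA training accuracy:', qda_check.score(X_sim, y_sim))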
Compare Logistic regression, LDA, QDA and KNN under different conditions
df_stock = pd.read_csv('Data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)
# convert direction to binary. Up is 1, Down is 0
df_stock.replace({'Up': 1, 'Down': 0}, inplace=True)
df_stock.head()
Year | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---
2001-01-01 | 0.381 | -0.192 | -2.624 | -1.055 | 5.010 | 1.1913 | 0.959 | 1 |
2001-01-01 | 0.959 | 0.381 | -0.192 | -2.624 | -1.055 | 1.2965 | 1.032 | 1 |
2001-01-01 | 1.032 | 0.959 | 0.381 | -0.192 | -2.624 | 1.4112 | -0.623 | 0 |
2001-01-01 | -0.623 | 1.032 | 0.959 | 0.381 | -0.192 | 1.2760 | 0.614 | 1 |
2001-01-01 | 0.614 | -0.623 | 1.032 | 0.959 | 0.381 | 1.2057 | 0.213 | 1 |
df_stock.describe()
statistic | Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction
---|---|---|---|---|---|---|---|---|
count | 1250.000000 | 1250.000000 | 1250.000000 | 1250.000000 | 1250.00000 | 1250.000000 | 1250.000000 | 1250.000000 |
mean | 0.003834 | 0.003919 | 0.001716 | 0.001636 | 0.00561 | 1.478305 | 0.003138 | 0.518400 |
std | 1.136299 | 1.136280 | 1.138703 | 1.138774 | 1.14755 | 0.360357 | 1.136334 | 0.499861 |
min | -4.922000 | -4.922000 | -4.922000 | -4.922000 | -4.92200 | 0.356070 | -4.922000 | 0.000000 |
25% | -0.639500 | -0.639500 | -0.640000 | -0.640000 | -0.64000 | 1.257400 | -0.639500 | 0.000000 |
50% | 0.039000 | 0.039000 | 0.038500 | 0.038500 | 0.03850 | 1.422950 | 0.038500 | 1.000000 |
75% | 0.596750 | 0.596750 | 0.596750 | 0.596750 | 0.59700 | 1.641675 | 0.596750 | 1.000000 |
max | 5.733000 | 5.733000 | 5.733000 | 5.733000 | 5.73300 | 3.152470 | 5.733000 | 1.000000 |
df_stock.corr()
Lag1 | Lag2 | Lag3 | Lag4 | Lag5 | Volume | Today | Direction | |
---|---|---|---|---|---|---|---|---|
Lag1 | 1.000000 | -0.026294 | -0.010803 | -0.002986 | -0.005675 | 0.040910 | -0.026155 | -0.039757 |
Lag2 | -0.026294 | 1.000000 | -0.025897 | -0.010854 | -0.003558 | -0.043383 | -0.010250 | -0.024081 |
Lag3 | -0.010803 | -0.025897 | 1.000000 | -0.024051 | -0.018808 | -0.041824 | -0.002448 | 0.006132 |
Lag4 | -0.002986 | -0.010854 | -0.024051 | 1.000000 | -0.027084 | -0.048414 | -0.006900 | 0.004215 |
Lag5 | -0.005675 | -0.003558 | -0.018808 | -0.027084 | 1.000000 | -0.022002 | -0.034860 | 0.005423 |
Volume | 0.040910 | -0.043383 | -0.041824 | -0.048414 | -0.022002 | 1.000000 | 0.014592 | 0.022951 |
Today | -0.026155 | -0.010250 | -0.002448 | -0.006900 | -0.034860 | 0.014592 | 1.000000 | 0.730563 |
Direction | -0.039757 | -0.024081 | 0.006132 | 0.004215 | 0.005423 | 0.022951 | 0.730563 | 1.000000 |
# very small correlations (Today and Direction are obviously correlated)
corr = df_stock.corr().values
np.max(np.abs(np.triu(corr, k=1)), axis=1)
array([ 0.04090991, 0.04338321, 0.04182369, 0.04841425, 0.03486008, 0.02295096, 0.7305629 , 0. ])
# volume increases with year
plot = sns.boxplot(x=df_stock.index, y=df_stock['Volume'])
plot.set_xticklabels([str(date.year) for date in df_stock.index.unique()]);
# volume by year and direction
ax = sns.boxplot(x=df_stock.index, y=df_stock['Volume'], hue=df_stock['Direction'])
ax.set_xticklabels([str(date.year) for date in df_stock.index.unique()])
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ["Down", "Up"]);
X = df_stock[df_stock.columns.difference(['Today', 'Direction'])]
y = df_stock['Direction']
X_train = X[:'2004']
y_train = y[:'2004']
X_test = X['2005':]
y_test = y['2005':]
logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X, y)
print_OLS_error_table(logistic, X, y)
print_classification_statistics(logistic, X, y, labels=['Down', 'Up'])
plot_ROC(logistic, X, y, label='Logistic Classification')
# same results as statsmodels
#smLogistic = sm.Logit(y, sm.add_constant(X)).fit()
#print(smLogistic.summary())
No. Observations: 1250    Df Residuals: 1243    Df Model: 6
Log-Likelihood: -863.79   AIC: 1741.58

           Coefficients  Standard Errors  t values  p values
Intercept       -0.1259            0.241    -0.523     0.601
Lag1            -0.0731            0.050    -1.457     0.145
Lag2            -0.0423            0.050    -0.845     0.399
Lag3             0.0111            0.050     0.222     0.824
Lag4             0.0094            0.050     0.187     0.851
Lag5             0.0103            0.050     0.208     0.835
Volume           0.1354            0.158     0.855     0.393

Classification Report:
             precision    recall  f1-score   support
       Down      0.507     0.241     0.327       602
         Up      0.526     0.782     0.629       648
avg / total      0.517     0.522     0.483      1250

Confusion Matrix:
Predicted      True     False
Real
True       0.240864  0.759136
False      0.217593  0.782407
logistic_test = skl_lm.LogisticRegression(C=1e10)
logistic_test.fit(X_train, y_train)
print_OLS_error_table(logistic_test, X_train, y_train)
print_classification_statistics(logistic_test, X_test, y_test, labels=['Down', 'Up'])
plot_ROC(logistic_test, X_test, y_test, label='Logistic Classification Train/Test')
No. Observations: 998    Df Residuals: 991    Df Model: 6
Log-Likelihood: -690.55  AIC: 1395.11

           Coefficients  Standard Errors  t values  p values
Intercept        0.1912            0.334     0.573     0.567
Lag1            -0.0542            0.052    -1.046     0.296
Lag2            -0.0458            0.052    -0.884     0.377
Lag3             0.0072            0.052     0.139     0.889
Lag4             0.0064            0.052     0.125     0.901
Lag5            -0.0042            0.051    -0.083     0.934
Volume          -0.1162            0.240    -0.485     0.628

Classification Report:
             precision    recall  f1-score   support
       Down      0.443     0.694     0.540       111
         Up      0.564     0.312     0.402       141
avg / total      0.511     0.480     0.463       252

Confusion Matrix:
Predicted      True     False
Real
True       0.693694  0.306306
False      0.687943  0.312057
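None of the predictors is statistically significant, so the ISLR lab next drops everything except Lag1 and Lag2 and refits. A minimal sketch of that step with scikit-learn, using the same train/test split as above:
# refit on the training years with only Lag1 and Lag2, evaluate on 2005
logistic_lag12 = skl_lm.LogisticRegression(C=1e10)
logistic_lag12.fit(X_train[['Lag1', 'Lag2']], y_train)
print(classification_report(y_test, logistic_lag12.predict(X_test[['Lag1', 'Lag2']])))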
# use only Lag1 and Lag2
X_train2 = X_train[['Lag1','Lag2']]
X_test2 = X_test[['Lag1','Lag2']]
lda = LinearDiscriminantAnalysis()
lda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=lda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=lda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()
print('Coefficients of linear discriminant: ')
print(pd.DataFrame(data=lda.scalings_, columns=['LDA'], index=X_train2.columns))
print()
Prior probabilities of groups:
       Down        Up
   0.491984  0.508016

Group means:
          Lag1      Lag2
Down  0.042790  0.033894
Up   -0.039546 -0.031325

Coefficients of linear discriminant:
           LDA
Lag1 -0.642019
Lag2 -0.513529
print_classification_statistics(lda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(lda, X_test2, y_test, label='LDA Train/Test, only Lag1 and Lag2')
plot_classification(lda, X_test2, y_test)
Classification Report:
             precision    recall  f1-score   support
       Down      0.500     0.315     0.387       111
         Up      0.582     0.752     0.656       141
avg / total      0.546     0.560     0.538       252

Confusion Matrix:
Predicted      True     False
Real
True       0.315315  0.684685
False      0.248227  0.751773
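The posterior probabilities behind these predictions can be inspected directly. Following the ISLR lab, a minimal sketch counts how many 2005 days have a posterior probability of a market decrease above 0.5 and above 0.9 (class 0 is 'Down'):
# posterior probabilities on the 2005 test set; column 0 corresponds to 'Down'
posteriors = lda.predict_proba(X_test2)
print('days with P(Down) > 0.5:', np.sum(posteriors[:, 0] > 0.5))
print('days with P(Down) > 0.9:', np.sum(posteriors[:, 0] > 0.9))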
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=qda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=qda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()
Prior probabilities of groups:
       Down        Up
   0.491984  0.508016

Group means:
          Lag1      Lag2
Down  0.042790  0.033894
Up   -0.039546 -0.031325
print_classification_statistics(qda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(qda, X_test2, y_test, label='QDA Train/Test, only Lag1 and Lag2')
plot_classification(qda, X_test2, y_test)
Classification Report:
             precision    recall  f1-score   support
       Down      0.600     0.270     0.373       111
         Up      0.599     0.858     0.706       141
avg / total      0.599     0.599     0.559       252

Confusion Matrix:
Predicted      True     False
Real
True       0.270270  0.729730
False      0.141844  0.858156
n_neighbors = 3
knn = neighbors.KNeighborsClassifier(n_neighbors)
knn.fit(X_train2, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2, weights='uniform')
print_classification_statistics(knn, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(knn, X_test2, y_test, label='KNN Train/Test, only Lag1 and Lag2')
plot_classification(knn, X_test2, y_test)
Classification Report:
             precision    recall  f1-score   support
       Down      0.466     0.432     0.449       111
         Up      0.577     0.610     0.593       141
avg / total      0.528     0.532     0.529       252

Confusion Matrix:
Predicted      True     False
Real
True       0.432432  0.567568
False      0.390071  0.609929
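K = 3 is only one choice; a minimal sketch of a small sweep over K, scored on the held-out 2005 data, shows how sensitive the test accuracy is to the number of neighbors:
# test accuracy of KNN for several values of K (illustrative grid)
for k in [1, 3, 5, 10, 20]:
    knn_k = neighbors.KNeighborsClassifier(n_neighbors=k).fit(X_train2, y_train)
    print(f'K={k}: test accuracy = {knn_k.score(X_test2, y_test):.3f}')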
df_caravan = pd.read_csv('Data/Caravan.csv')
df_caravan['Purchase'] = df_caravan['Purchase'].astype('category')
df_caravan.head()
# | Unnamed: 0 | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | ... | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | Purchase
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 33 | 1 | 3 | 2 | 8 | 0 | 5 | 1 | 3 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
1 | 2 | 37 | 1 | 2 | 2 | 8 | 1 | 4 | 1 | 4 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
2 | 3 | 37 | 1 | 2 | 2 | 8 | 0 | 4 | 2 | 4 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
3 | 4 | 9 | 1 | 3 | 3 | 3 | 2 | 3 | 2 | 4 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
4 | 5 | 40 | 1 | 4 | 2 | 10 | 1 | 4 | 1 | 4 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | No |
5 rows × 87 columns
df_caravan['Purchase'].value_counts()
No     5474
Yes     348
Name: Purchase, dtype: int64
y = df_caravan.Purchase
X = df_caravan.drop('Purchase', axis=1).astype('float64')
X_scaled = preprocessing.scale(X)
X_test = X_scaled[:1000]
y_test = y[:1000]
X_train = X_scaled[1000:]
y_train = y[1000:]
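Because KNN is distance-based, the very different scales of the Caravan predictors would otherwise let a handful of variables dominate the distance computation. A minimal sketch confirming that preprocessing.scale() gives every column mean zero and unit variance:
# raw columns have widely varying variances; the scaled matrix does not
print(X.var().head(3))
print('scaled means:', np.round(X_scaled.mean(axis=0)[:3], 6))
print('scaled stds: ', np.round(X_scaled.std(axis=0)[:3], 6))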
for i in [1, 3, 5]:
print(f'Using {i} neighbors')
knn = neighbors.KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
print_classification_statistics(knn, X_test, y_test, labels=['No', 'Yes'])
#plot_ROC(knn, X_test, y_test, label='KNN')
#skplt.metrics.plot_confusion_matrix(y_test, knn.predict(X_test), normalize=False)
plt.show()
Using 1 neighbors
Classification Report:
             precision    recall  f1-score   support
         No      0.948     0.937     0.943       941
        Yes      0.157     0.186     0.171        59
avg / total      0.902     0.893     0.897      1000

Confusion Matrix:
Predicted      True     False
Real
True       0.937301  0.062699
False      0.813559  0.186441

Using 3 neighbors
Classification Report:
             precision    recall  f1-score   support
         No      0.946     0.979     0.962       941
        Yes      0.231     0.102     0.141        59
avg / total      0.903     0.927     0.913      1000

Confusion Matrix:
Predicted      True     False
Real
True       0.978746  0.021254
False      0.898305  0.101695

Using 5 neighbors
Classification Report:
             precision    recall  f1-score   support
         No      0.944     0.993     0.968       941
        Yes      0.364     0.068     0.114        59
avg / total      0.910     0.938     0.918      1000

Confusion Matrix:
Predicted      True     False
Real
True       0.992561  0.007439
False      0.932203  0.067797
logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X_train, y_train)
print_classification_statistics(logistic, X_test, y_test, labels=['No', 'Yes'])
Classification Report:
             precision    recall  f1-score   support
         No      0.941     0.994     0.966       941
        Yes      0.000     0.000     0.000        59
avg / total      0.885     0.935     0.909      1000

Confusion Matrix:
Predicted      True     False
Real
True       0.993624  0.006376
False      1.000000  0.000000
# use a 25% predicted probability of buying as the threshold instead of 50%
pred_p = logistic.predict_proba(X_test)
cm_df = pd.DataFrame({'True': y_test, 'Pred': pred_p[:,1] > .25})
cm_df['Pred'] = cm_df['Pred'].map({True: 'Yes', False: 'No'})
print(classification_report(y_test, cm_df.Pred))
print(cm_df.groupby(['True', 'Pred']).size().unstack('True').T)
             precision    recall  f1-score   support
         No       0.95      0.98      0.96       941
        Yes       0.34      0.19      0.24        59
avg / total       0.91      0.93      0.92      1000

Pred   No  Yes
True
No    920   21
Yes    48   11
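With the lower 0.25 threshold, the fraction of contacted customers who actually buy insurance (the precision for the 'Yes' class, about 0.34 here) can also be computed directly with precision_score, which was imported above; a minimal sketch:
# precision for 'Yes' when predicting a purchase above a 0.25 probability
print(precision_score(y_test, cm_df.Pred, pos_label='Yes'))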