Notebook

Motivation: evaluate variable correlations and multi-dimensional interactions to understand which features are the most influential towards income; moreover, what perhaps may we learn from those features which may not typically strike one as intuitive influencers of capital gain outcomes: social indices such as relationships, marital status, or race. Hypothesis: social indices will have a strong influence on income. Null Hypothesis: social indices will have no impact on income.

In [1]:

import pandas as pd
import seaborn as sns; sns.set_style('white')
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import sklearn

In [2]:

path = ('/Users/bryanevan/Downloads/adult.csv')
df=pd.read_csv(path, low_memory=False)

In [3]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

In [4]:

# select data we want to regress onto income
data=df[['age',
        'education',
        'marital.status',
        'occupation',
        'race',
        'sex',
        'native.country',
        'capital.gain',
         'fnlwgt'
         ]]

In [5]:

# label encode to create integer values from objects
from sklearn.preprocessing import LabelEncoder
data = data.apply(LabelEncoder().fit_transform)

In [6]:

sns.set_style('white')
sns.heatmap(data.corr());

In [7]:

# turn categorical into binary
df['income'] = np.where(df['income'] == '<=50K',0,1)

In [8]:

X = data
y = df.income
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=1);

In [9]:

a=X_train.shape
b=y_test.shape
print(a)
print(b)

(26048, 9)
(6513,)

In [10]:

# overview of performance of several different classifiers
# no change of distribution shapes
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
seed = 7
models = []
models.append(('LOG', LogisticRegression(solver='lbfgs', max_iter=1000)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('TREE', DecisionTreeClassifier()))
models.append(('BER', BernoulliNB()))
models.append(('SVC', SVC(gamma='auto')))
models.append(('RF', RandomForestClassifier(n_estimators=4, max_features='sqrt')))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

LOG: 0.784125 (0.014382)
KNN: 0.747731 (0.016545)
TREE: 0.787351 (0.009463)
BER: 0.783509 (0.014596)
SVC: 0.760017 (0.015505)
RF: 0.824046 (0.015213)

In [11]:

# random forest looks solid, so run it
rf = RandomForestClassifier(n_estimators=4)
rf.fit(X, y)
rf.score(X, y)
rf_pred = rf.predict(X)
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
acc = accuracy_score(y, rf_pred)
clas = classification_report(y, rf_pred)
print('\nAccuracy:',acc)
print('\nClassification Report:'), print(clas)
# show important features
conf_mat = confusion_matrix(y, rf_pred)
print('\nConfusion Matrix:'), print(conf_mat)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(conf_mat)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
plt.show()

feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = data.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Accuracy: 0.9587236264242499

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     24720
           1       0.98      0.85      0.91      7841

   micro avg       0.96      0.96      0.96     32561
   macro avg       0.97      0.92      0.94     32561
weighted avg       0.96      0.96      0.96     32561


Confusion Matrix:
[[24574   146]
 [ 1198  6643]]

Out[11]:

	importance
fnlwgt	0.281053
age	0.182338
capital.gain	0.148843
marital.status	0.146085
education	0.100132
occupation	0.084411
sex	0.023993
native.country	0.020882
race	0.012263

In [12]:

# log came back with promising results, so run many iterations
from scipy import stats
import statsmodels.api as sm

logit = sm.Logit(y, X)
result = logit.fit()
log_pred = result.predict(X)
# Code result as 1 if probability is greater than .5.
log_outcome = np.where(log_pred < .5, 0, 1)
table = pd.crosstab(y, log_outcome)

print('\n Accuracy by Income Ranking')
print(table)
print('\n Percentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))
print(result.summary())

plt.figure(figsize=(6,6))
sns.heatmap(table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(table)
plt.title(all_sample_title, size = 15);

Optimization terminated successfully.
         Current function value: 0.464976
         Iterations 6

 Accuracy by Income Ranking
col_0       0     1
income             
0       23616  1104
1        5992  1849

 Percentage accuracy
0.7820705752280336
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                 income   No. Observations:                32561
Model:                          Logit   Df Residuals:                    32552
Method:                           MLE   Df Model:                            8
Date:                Tue, 26 Feb 2019   Pseudo R-squ.:                  0.1577
Time:                        18:45:15   Log-Likelihood:                -15140.
converged:                       True   LL-Null:                       -17974.
                                        LLR p-value:                     0.000
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0199      0.001     18.526      0.000       0.018       0.022
education          0.0115      0.004      3.201      0.001       0.004       0.018
marital.status    -0.3604      0.010    -35.223      0.000      -0.380      -0.340
occupation         0.0149      0.003      4.466      0.000       0.008       0.021
race              -0.1554      0.015    -10.628      0.000      -0.184      -0.127
sex                0.9583      0.035     27.640      0.000       0.890       1.026
native.country    -0.0292      0.001    -20.052      0.000      -0.032      -0.026
capital.gain       0.0278      0.001     43.301      0.000       0.027       0.029
fnlwgt         -2.042e-05   2.29e-06     -8.915      0.000   -2.49e-05   -1.59e-05
==================================================================================

In [13]:

from sklearn import linear_model
ridge = linear_model.Ridge(alpha = 100, fit_intercept=True)
result = ridge.fit(X,y)
ridge_pred = result.predict(X)
# Code result as 1 if probability is greater than .5.
ridge_outcome = np.where(ridge_pred < .5, 0, 1)
ridge_table = pd.crosstab(y, ridge_outcome)

rr = linear_model.Ridge(alpha=0.01)
rr.fit(X, y)
rr100 = linear_model.Ridge(alpha=100) #  comparison with alpha value
rr100.fit(X, y)

Ridge_score = rr.score(X, y)
Ridge100_score = rr100.score(X,y)
print ("ridge regression low alpha:", Ridge_score)
print ("ridge regression high alpha:", Ridge100_score)
plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.show()

plt.figure(figsize=(6,6))
sns.heatmap(ridge_table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(ridge_table)
plt.title(all_sample_title, size = 15);

ridge regression low alpha: 0.2056518264938597
ridge regression high alpha: 0.20564698274812998

In [14]:

import math
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X,y)
lasso_score=lasso.score(X,y)
coeff_used = np.sum(lasso.coef_!=0)
print ("Default score:", lasso_score)
print ("number of features used: ", coeff_used)

lasso_pred = result.predict(X)
lasso_outcome = np.where(lasso_pred < .5, 0, 1)
lasso_table = pd.crosstab(y, lasso_outcome)

lasso001 = Lasso(alpha=0.01, max_iter=10e5)
lasso001.fit(X,y)
lasso_score001=lasso001.score(X,y)
coeff_used001 = np.sum(lasso001.coef_!=0)
print ("Score for alpha=0.01:", lasso_score001)
print ("number of features used: for alpha =0.01:", coeff_used001)

lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(X,y)
lasso_score00001=lasso00001.score(X,y)
coeff_used00001 = np.sum(lasso00001.coef_!=0)
print ("Score for alpha=0.0001:", lasso_score00001) 
print ("number of features used: for alpha =0.0001:", coeff_used00001)

plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)

plt.subplot(1,2,2)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.00001$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.tight_layout()
plt.show()

plt.figure(figsize=(6,6))
sns.heatmap(lasso_table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(lasso_table)
plt.title(all_sample_title, size = 15);

Default score: 0.1169548534483098
number of features used:  3
Score for alpha=0.01: 0.2024875209076583
number of features used: for alpha =0.01: 9
Score for alpha=0.0001: 0.2056515105918646
number of features used: for alpha =0.0001: 9

In [15]:

# NB showed up well also, so run it 
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X,y)
bnb_pred = bnb.predict(X)

print("Missed points out of {} points : {}".format(
data.shape[0],
(y != bnb_pred).sum()
))
bnb_mat = confusion_matrix(y, bnb_pred)
print('\nBernoulli Confusion Matrix:'), print(bnb_mat)

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(bnb_mat)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
plt.show()

Missed points out of 32561 points : 7115

Bernoulli Confusion Matrix:
[[24164   556]
 [ 6559  1282]]

In [16]:

# standardize b4 KNN and pca
from sklearn.model_selection import train_test_split
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(X, y, 
                                                                    test_size=0.2, 
                                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  
pca_X_train = sc.fit_transform(pca_X_train)  
pca_X_test = sc.transform(pca_X_test)  

/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/anaconda3/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:10: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  # Remove the CWD from sys.path while we load stuff.

In [17]:

# let's give KNN a chance to predict, so parameterize features w/ pca
from sklearn.decomposition import PCA
pca=PCA().fit(pca_X_train)
plt.figure()
plt.style.available[:1]
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Explained Variance')
plt.show()

In [18]:

pca = PCA(n_components=8)  
stdr = pca.fit_transform(pca_X_train)  
cov_mat = np.cov(pca_X_train.T)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)

for i in range(len(eig_val_cov)):
    eigvec_cov = eig_vec_cov[:, i]
    print('Eigenvalue {}: {}'.format(i + 1, eig_val_cov[i]))
    print(40 * '-')

Eigenvalue 1: 1.4445872559592263
----------------------------------------
Eigenvalue 2: 0.7102981013014953
----------------------------------------
Eigenvalue 3: 1.1564833975444346
----------------------------------------
Eigenvalue 4: 1.0886276127593044
----------------------------------------
Eigenvalue 5: 0.8236900822816902
----------------------------------------
Eigenvalue 6: 0.8666543710573236
----------------------------------------
Eigenvalue 7: 0.9286983602516429
----------------------------------------
Eigenvalue 8: 0.9933239886562135
----------------------------------------
Eigenvalue 9: 0.9879823594242787
----------------------------------------

In [19]:

from sklearn import neighbors
from sklearn.model_selection import GridSearchCV, cross_val_score
# run several iterations to find best number of neighbors
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}
knn = neighbors.KNeighborsClassifier()
model = GridSearchCV(knn, params, cv=10)
model.fit(pca_X_train,pca_y_train)
model.best_params_

Out[19]:

{'n_neighbors': 8}

In [20]:

# KNN Regression
knn=KNeighborsClassifier(8, weights='distance')
knn.fit(pca_X_train,pca_y_train)

knn_cv = cross_val_score(knn,pca_X_test,pca_y_test, cv=10)
print(knn_cv)
print(knn.score(pca_X_test, pca_y_test))

[0.82055215 0.80214724 0.81441718 0.80214724 0.80521472 0.82055215
 0.8202765  0.85692308 0.83076923 0.82769231]
0.822201750345463

In [21]:

error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(pca_X_train, pca_y_train)
    pred_i = knn.predict(pca_X_test)
    error.append(np.mean(pred_i != pca_y_test))

plt.figure(figsize=(8, 4))  
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',  
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')  
plt.xlabel('K Value')  
plt.ylabel('Mean Error')  

Out[21]:

Text(0, 0.5, 'Mean Error')