Motivation: evaluate variable correlations and multi-dimensional interactions to understand which features are the most influential towards income; moreover, what perhaps may we learn from those features which may not typically strike one as intuitive influencers of capital gain outcomes: social indices such as relationships, marital status, or race. Hypothesis: social indices will have a strong influence on income. Null Hypothesis: social indices will have no impact on income.
import pandas as pd
import seaborn as sns; sns.set_style('white')
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import sklearn
path = ('/Users/bryanevan/Downloads/adult.csv')
df=pd.read_csv(path, low_memory=False)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 15 columns): age 32561 non-null int64 workclass 32561 non-null object fnlwgt 32561 non-null int64 education 32561 non-null object education.num 32561 non-null int64 marital.status 32561 non-null object occupation 32561 non-null object relationship 32561 non-null object race 32561 non-null object sex 32561 non-null object capital.gain 32561 non-null int64 capital.loss 32561 non-null int64 hours.per.week 32561 non-null int64 native.country 32561 non-null object income 32561 non-null object dtypes: int64(6), object(9) memory usage: 3.7+ MB
# select data we want to regress onto income
data=df[['age',
'education',
'marital.status',
'occupation',
'race',
'sex',
'native.country',
'capital.gain',
'fnlwgt'
]]
# label encode to create integer values from objects
from sklearn.preprocessing import LabelEncoder
data = data.apply(LabelEncoder().fit_transform)
sns.set_style('white')
sns.heatmap(data.corr());
# turn categorical into binary
df['income'] = np.where(df['income'] == '<=50K',0,1)
X = data
y = df.income
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=1);
a=X_train.shape
b=y_test.shape
print(a)
print(b)
(26048, 9) (6513,)
# overview of performance of several different classifiers
# no change of distribution shapes
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
seed = 7
models = []
models.append(('LOG', LogisticRegression(solver='lbfgs', max_iter=1000)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('TREE', DecisionTreeClassifier()))
models.append(('BER', BernoulliNB()))
models.append(('SVC', SVC(gamma='auto')))
models.append(('RF', RandomForestClassifier(n_estimators=4, max_features='sqrt')))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_test, y_test, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
LOG: 0.784125 (0.014382) KNN: 0.747731 (0.016545) TREE: 0.787351 (0.009463) BER: 0.783509 (0.014596) SVC: 0.760017 (0.015505) RF: 0.824046 (0.015213)
# random forest looks solid, so run it
rf = RandomForestClassifier(n_estimators=4)
rf.fit(X, y)
rf.score(X, y)
rf_pred = rf.predict(X)
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
acc = accuracy_score(y, rf_pred)
clas = classification_report(y, rf_pred)
print('\nAccuracy:',acc)
print('\nClassification Report:'), print(clas)
# show important features
conf_mat = confusion_matrix(y, rf_pred)
print('\nConfusion Matrix:'), print(conf_mat)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(conf_mat)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
plt.show()
feature_importances = pd.DataFrame(rf.feature_importances_,
index = data.columns,
columns=['importance']).sort_values('importance',ascending=False)
feature_importances
Accuracy: 0.9587236264242499 Classification Report: precision recall f1-score support 0 0.95 0.99 0.97 24720 1 0.98 0.85 0.91 7841 micro avg 0.96 0.96 0.96 32561 macro avg 0.97 0.92 0.94 32561 weighted avg 0.96 0.96 0.96 32561 Confusion Matrix: [[24574 146] [ 1198 6643]]
importance | |
---|---|
fnlwgt | 0.281053 |
age | 0.182338 |
capital.gain | 0.148843 |
marital.status | 0.146085 |
education | 0.100132 |
occupation | 0.084411 |
sex | 0.023993 |
native.country | 0.020882 |
race | 0.012263 |
# log came back with promising results, so run many iterations
from scipy import stats
import statsmodels.api as sm
logit = sm.Logit(y, X)
result = logit.fit()
log_pred = result.predict(X)
# Code result as 1 if probability is greater than .5.
log_outcome = np.where(log_pred < .5, 0, 1)
table = pd.crosstab(y, log_outcome)
print('\n Accuracy by Income Ranking')
print(table)
print('\n Percentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))
print(result.summary())
plt.figure(figsize=(6,6))
sns.heatmap(table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(table)
plt.title(all_sample_title, size = 15);
Optimization terminated successfully. Current function value: 0.464976 Iterations 6 Accuracy by Income Ranking col_0 0 1 income 0 23616 1104 1 5992 1849 Percentage accuracy 0.7820705752280336 Logit Regression Results ============================================================================== Dep. Variable: income No. Observations: 32561 Model: Logit Df Residuals: 32552 Method: MLE Df Model: 8 Date: Tue, 26 Feb 2019 Pseudo R-squ.: 0.1577 Time: 18:45:15 Log-Likelihood: -15140. converged: True LL-Null: -17974. LLR p-value: 0.000 ================================================================================== coef std err z P>|z| [0.025 0.975] ---------------------------------------------------------------------------------- age 0.0199 0.001 18.526 0.000 0.018 0.022 education 0.0115 0.004 3.201 0.001 0.004 0.018 marital.status -0.3604 0.010 -35.223 0.000 -0.380 -0.340 occupation 0.0149 0.003 4.466 0.000 0.008 0.021 race -0.1554 0.015 -10.628 0.000 -0.184 -0.127 sex 0.9583 0.035 27.640 0.000 0.890 1.026 native.country -0.0292 0.001 -20.052 0.000 -0.032 -0.026 capital.gain 0.0278 0.001 43.301 0.000 0.027 0.029 fnlwgt -2.042e-05 2.29e-06 -8.915 0.000 -2.49e-05 -1.59e-05 ==================================================================================
from sklearn import linear_model
ridge = linear_model.Ridge(alpha = 100, fit_intercept=True)
result = ridge.fit(X,y)
ridge_pred = result.predict(X)
# Code result as 1 if probability is greater than .5.
ridge_outcome = np.where(ridge_pred < .5, 0, 1)
ridge_table = pd.crosstab(y, ridge_outcome)
rr = linear_model.Ridge(alpha=0.01)
rr.fit(X, y)
rr100 = linear_model.Ridge(alpha=100) # comparison with alpha value
rr100.fit(X, y)
Ridge_score = rr.score(X, y)
Ridge100_score = rr100.score(X,y)
print ("ridge regression low alpha:", Ridge_score)
print ("ridge regression high alpha:", Ridge100_score)
plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.show()
plt.figure(figsize=(6,6))
sns.heatmap(ridge_table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(ridge_table)
plt.title(all_sample_title, size = 15);
ridge regression low alpha: 0.2056518264938597 ridge regression high alpha: 0.20564698274812998
import math
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X,y)
lasso_score=lasso.score(X,y)
coeff_used = np.sum(lasso.coef_!=0)
print ("Default score:", lasso_score)
print ("number of features used: ", coeff_used)
lasso_pred = result.predict(X)
lasso_outcome = np.where(lasso_pred < .5, 0, 1)
lasso_table = pd.crosstab(y, lasso_outcome)
lasso001 = Lasso(alpha=0.01, max_iter=10e5)
lasso001.fit(X,y)
lasso_score001=lasso001.score(X,y)
coeff_used001 = np.sum(lasso001.coef_!=0)
print ("Score for alpha=0.01:", lasso_score001)
print ("number of features used: for alpha =0.01:", coeff_used001)
lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(X,y)
lasso_score00001=lasso00001.score(X,y)
coeff_used00001 = np.sum(lasso00001.coef_!=0)
print ("Score for alpha=0.0001:", lasso_score00001)
print ("number of features used: for alpha =0.0001:", coeff_used00001)
plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.subplot(1,2,2)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.00001$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.tight_layout()
plt.show()
plt.figure(figsize=(6,6))
sns.heatmap(lasso_table, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'viridis_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(lasso_table)
plt.title(all_sample_title, size = 15);
Default score: 0.1169548534483098 number of features used: 3 Score for alpha=0.01: 0.2024875209076583 number of features used: for alpha =0.01: 9 Score for alpha=0.0001: 0.2056515105918646 number of features used: for alpha =0.0001: 9
# NB showed up well also, so run it
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X,y)
bnb_pred = bnb.predict(X)
print("Missed points out of {} points : {}".format(
data.shape[0],
(y != bnb_pred).sum()
))
bnb_mat = confusion_matrix(y, bnb_pred)
print('\nBernoulli Confusion Matrix:'), print(bnb_mat)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(bnb_mat)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
plt.show()
Missed points out of 32561 points : 7115 Bernoulli Confusion Matrix: [[24164 556] [ 6559 1282]]
# standardize b4 KNN and pca
from sklearn.model_selection import train_test_split
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(X, y,
test_size=0.2,
random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
pca_X_train = sc.fit_transform(pca_X_train)
pca_X_test = sc.transform(pca_X_test)
/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler. return self.partial_fit(X, y) /anaconda3/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler. return self.fit(X, **fit_params).transform(X) /anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:10: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler. # Remove the CWD from sys.path while we load stuff.
# let's give KNN a chance to predict, so parameterize features w/ pca
from sklearn.decomposition import PCA
pca=PCA().fit(pca_X_train)
plt.figure()
plt.style.available[:1]
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Explained Variance')
plt.show()
pca = PCA(n_components=8)
stdr = pca.fit_transform(pca_X_train)
cov_mat = np.cov(pca_X_train.T)
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
for i in range(len(eig_val_cov)):
eigvec_cov = eig_vec_cov[:, i]
print('Eigenvalue {}: {}'.format(i + 1, eig_val_cov[i]))
print(40 * '-')
Eigenvalue 1: 1.4445872559592263 ---------------------------------------- Eigenvalue 2: 0.7102981013014953 ---------------------------------------- Eigenvalue 3: 1.1564833975444346 ---------------------------------------- Eigenvalue 4: 1.0886276127593044 ---------------------------------------- Eigenvalue 5: 0.8236900822816902 ---------------------------------------- Eigenvalue 6: 0.8666543710573236 ---------------------------------------- Eigenvalue 7: 0.9286983602516429 ---------------------------------------- Eigenvalue 8: 0.9933239886562135 ---------------------------------------- Eigenvalue 9: 0.9879823594242787 ----------------------------------------
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV, cross_val_score
# run several iterations to find best number of neighbors
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}
knn = neighbors.KNeighborsClassifier()
model = GridSearchCV(knn, params, cv=10)
model.fit(pca_X_train,pca_y_train)
model.best_params_
{'n_neighbors': 8}
# KNN Regression
knn=KNeighborsClassifier(8, weights='distance')
knn.fit(pca_X_train,pca_y_train)
knn_cv = cross_val_score(knn,pca_X_test,pca_y_test, cv=10)
print(knn_cv)
print(knn.score(pca_X_test, pca_y_test))
[0.82055215 0.80214724 0.81441718 0.80214724 0.80521472 0.82055215 0.8202765 0.85692308 0.83076923 0.82769231] 0.822201750345463
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(pca_X_train, pca_y_train)
pred_i = knn.predict(pca_X_test)
error.append(np.mean(pred_i != pca_y_test))
plt.figure(figsize=(8, 4))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
Text(0, 0.5, 'Mean Error')