import pandas as pd
import os
import sklearn
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.cross_validation import train_test_split
print sklearn.__version__
print pd.__version__
0.19.1 0.20.3
# reading data to df
DATA_DIR = '../data'
df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))
df.head(5)
ID | JobFamily | JobFamilyDescription | JobClass | JobClassDescription | PayGrade | EducationLevel | Experience | OrgImpact | ProblemSolving | Supervision | ContactLevel | FinancialBudget | PG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Accounting And Finance | 1 | Accountant I | 5 | 3 | 1 | 3 | 3 | 4 | 3 | 5 | PG05 |
1 | 2 | 1 | Accounting And Finance | 2 | Accountant II | 6 | 4 | 1 | 5 | 4 | 5 | 7 | 7 | PG06 |
2 | 3 | 1 | Accounting And Finance | 3 | Accountant III | 8 | 4 | 2 | 6 | 5 | 6 | 7 | 10 | PG08 |
3 | 4 | 1 | Accounting And Finance | 4 | Accountant IV | 10 | 5 | 5 | 6 | 6 | 7 | 8 | 11 | PG10 |
4 | 5 | 2 | Administrative Support | 5 | Admin Support I | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | PG01 |
# target is to predict the column PG using rest (all to one)
target = df['PG']
# dropping unnecessary columns and keeping just the concerned features
df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'], axis=1, inplace=True)
df.head(2)
JobFamily | JobClass | PayGrade | EducationLevel | Experience | OrgImpact | ProblemSolving | Supervision | ContactLevel | FinancialBudget | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 5 | 3 | 1 | 3 | 3 | 4 | 3 | 5 |
1 | 1 | 2 | 6 | 4 | 1 | 5 | 4 | 5 | 7 | 7 |
# check for NaN (Missing values)
df.isnull().sum()
# Luckily not a single missing values
JobFamily 0 JobClass 0 PayGrade 0 EducationLevel 0 Experience 0 OrgImpact 0 ProblemSolving 0 Supervision 0 ContactLevel 0 FinancialBudget 0 dtype: int64
X = df[:].values
Y = target.values
# ideal practice is to use test as 20% - 30% of training data
# defined by test_size in train_test_split()
# random_state is required to avoid sequential biasness in the data distribution
def data_split(X, Y):
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)
return X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = data_split(X, Y)
clf1 = RandomForestClassifier(random_state=1)
clf2 = XGBClassifier()
lr = LogisticRegression()
sclf = StackingClassifier(
classifiers = [clf1, clf2],
meta_classifier=lr,
use_probas=True,
average_probas=False,
)
print('5-fold cross validation:\n')
for clf, label in zip([clf1, clf2, sclf],
['Random Forest',
'XGBoost',
'StackingClassifier']):
scores = sklearn.model_selection.cross_val_score(clf, X_train, Y_train,
cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]"
% (scores.mean(), scores.std(), label))
5-fold cross validation: Accuracy: 0.70 (+/- 0.10) [Random Forest] Accuracy: 0.92 (+/- 0.09) [XGBoost] Accuracy: 0.86 (+/- 0.14) [StackingClassifier]