Imports¶

In [1]:

import pandas as pd
import os
import sklearn
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.cross_validation import train_test_split

print sklearn.__version__
print pd.__version__

0.19.1
0.20.3

In [2]:

# reading data to df
DATA_DIR = '../data'

df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))
df.head(5)

Out[2]:

	ID	JobFamily	JobFamilyDescription	JobClass	JobClassDescription	PayGrade	EducationLevel	Experience	OrgImpact	ProblemSolving	Supervision	ContactLevel	FinancialBudget	PG
0	1	1	Accounting And Finance	1	Accountant I	5	3	1	3	3	4	3	5	PG05
1	2	1	Accounting And Finance	2	Accountant II	6	4	1	5	4	5	7	7	PG06
2	3	1	Accounting And Finance	3	Accountant III	8	4	2	6	5	6	7	10	PG08
3	4	1	Accounting And Finance	4	Accountant IV	10	5	5	6	6	7	8	11	PG10
4	5	2	Administrative Support	5	Admin Support I	1	1	0	1	1	1	1	1	PG01

In [3]:

# target is to predict the column PG using rest (all to one)
target = df['PG']

# dropping unnecessary columns and keeping just the concerned features
df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'], axis=1, inplace=True)

In [4]:

df.head(2)

Out[4]:

	JobFamily	JobClass	PayGrade	EducationLevel	Experience	OrgImpact	ProblemSolving	Supervision	ContactLevel	FinancialBudget
0	1	1	5	3	1	3	3	4	3	5
1	1	2	6	4	1	5	4	5	7	7

In [5]:

# check for NaN (Missing values)
df.isnull().sum()

# Luckily not a single missing values

Out[5]:

JobFamily          0
JobClass           0
PayGrade           0
EducationLevel     0
Experience         0
OrgImpact          0
ProblemSolving     0
Supervision        0
ContactLevel       0
FinancialBudget    0
dtype: int64

In [6]:

X = df[:].values
Y = target.values

In [7]:

# ideal practice is to use test as 20% - 30% of training data
# defined by test_size in train_test_split()
# random_state is required to avoid sequential biasness in the data distribution
def data_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = data_split(X, Y)

Model - 1 [Random Forest]¶

In [8]:

clf1 = RandomForestClassifier(random_state=1)

Model - 2 [XGBoost]¶

In [9]:

clf2 = XGBClassifier()

Meta Model [Logit]¶

In [10]:

lr = LogisticRegression()
sclf = StackingClassifier(
                          classifiers = [clf1, clf2], 
                          meta_classifier=lr,
                          use_probas=True,
                          average_probas=False,
                         )

In [11]:

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, sclf], 
                      ['Random Forest', 
                       'XGBoost',
                       'StackingClassifier']):

    scores = sklearn.model_selection.cross_val_score(clf, X_train, Y_train, 
                                              cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.70 (+/- 0.10) [Random Forest]
Accuracy: 0.92 (+/- 0.09) [XGBoost]
Accuracy: 0.86 (+/- 0.14) [StackingClassifier]