Workflow

  1. Objective - Define Problem and Metrics
  2. Import and Store Data
  3. Data Exploration and Data Cleaning
  4. Feature Engineering from Raw Data
  5. Classification (Modelling)
  6. Communicating Results (visualizations included)

1. Objective

Implement a "Hand Geometry based Person Identification" system using supervised learning. Training & Testing data with labels is provided. The classification system will be measured by accuracy.

In [635]:
#Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [24, 10]
sns.set(style="darkgrid")

2. Import & Store Data

In [72]:
def getxLabel(x):
    return "{:02d}x".format(x)

def getyLabel(y):
    return "{:02d}y".format(y)
In [706]:
NPOINTS = 23
data_cols = ['label'] 
for p in range(1,NPOINTS+1):
    data_cols.append(getxLabel(p))
    data_cols.append(getyLabel(p))
df_train = pd.read_csv('Assignment2-handtrainfile.txt',delimiter=' ',header=None,names=data_cols)
df_test = pd.read_csv('Assignment2-handtestfile.txt',delimiter=' ',header=None,names=data_cols)

print("TRAIN DATA:", df_train.shape, "#ofUniqIDs:",len(df_train.label.unique()))
print("TEST DATA:", df_test.shape)
TRAIN DATA: (100, 47) #ofUniqIDs: 20
TEST DATA: (100, 47)
In [707]:
#Convert labels to integer codes
df_train['label'], mapping_index = pd.Series(df_train['label']).factorize()
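mapping_index keeps the original person IDs, so the integer codes can be translated back when writing the output files in Section 5; a minimal sketch:

original_labels = mapping_index[df_train['label'].values]   # recover the original string IDs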

3. Data Exploration and Cleaning

  • What kind of data do we have? => Numerical only.
  • Is there any missing data? => No.
  • Are there outliers, and should we care? => No. (Checked with the sketch below.)
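A minimal sketch of the checks behind these answers, using the dataframes loaded above:

print(df_train.isnull().values.any())   # False -> no missing values
print(df_train.dtypes.unique())         # numeric dtypes only
df_train.describe()                     # coordinate ranges look plausible -> no outliers worth treating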
In [843]:
#Plot a high-level view of the data to confirm it is what we expect
sns.set_style("white")
def plotHexBins(df):
    x = df['01x'].values
    y = df['01y'].values

    for p in range(2,NPOINTS+1):
        x = np.append(x,df[getxLabel(p)].values)
        y = np.append(y,df[getyLabel(p)].values)

    g = sns.jointplot(x=x, y=y, kind="hex", color="m", height=16, marginal_kws=dict(bins=50, rug=True),
                      gridsize=50)
    # g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
    # g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$X$", "$Y$")
    g.fig.axes[0].invert_yaxis()
    # g.fig.axes[0].invert_xaxis()
    plt.show()

sns.despine()
plotHexBins(df_train)
plotHexBins(df_test)

4. Feature Engineering

  • Distances between every pair of points?
  • Triangles over every triplet of points: perimeter and area? (Heron's formula, sketched below.)
  • Removing highly correlated features?
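The triangle features computed below rely on Heron's formula: with side lengths a, b, c and semi-perimeter s = (a + b + c) / 2, the area is sqrt(s(s - a)(s - b)(s - c)). A minimal standalone sketch:

import math

def triangle_area(a, b, c):
    # Heron's formula: area of a triangle from its three side lengths
    s = (a + b + c) / 2
    return math.sqrt(s * (s - a) * (s - b) * (s - c))

triangle_area(3, 4, 5)   # -> 6.0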
In [856]:
%%time
from sklearn.metrics.pairwise import paired_distances

def getListofPoints(df, p):
    return df[[getxLabel(p),getyLabel(p)]].values.tolist()

def getEdColname(p1,p2):
    return "{:02d}_e_{:02d}".format(p1,p2)
    
def getMdColname(p1,p2):
    return "{:02d}_m_{:02d}".format(p1,p2)
    
def getDistances(df):
    '''
    Calculates euclidean and manhattan distances between all possible pairs
    and stores the distances in the data frame.
    RETURNS:
        - DataFrame
    '''
    
    for p1 in range(1,NPOINTS+1):
        p1s = getListofPoints(df, p1)
        for p2 in range(p1+1,NPOINTS+1):
            p2s = getListofPoints(df, p2)
            colname = getEdColname(p1,p2)
            df[colname] = paired_distances(p1s,p2s,'euclidean')
            colname = getMdColname(p1,p2)
            df[colname] = paired_distances(p1s,p2s,'manhattan')
    return df


df_trainf = getDistances(df_train.copy())
df_testf = getDistances(df_test.copy())
CPU times: user 850 ms, sys: 0 ns, total: 850 ms
Wall time: 858 ms
In [857]:
%%time
import itertools

def getPerimeterColname(comb):
    return "P_{:02d}_{:02d}_{:02d}".format(comb[0],comb[1],comb[2])

def getTAreaColname(comb):
    return "TA_{:02d}_{:02d}_{:02d}".format(comb[0],comb[1],comb[2])

def getSideLength(df,p1,p2):
    try:
        sl = df[getEdColname(p1,p2)]
    except KeyError:
        sl = df[getEdColname(p2,p1)]
    return sl

def getTriangleFeatures(df):
    '''
    Calculates triangle perimeter and area for every triplet of points
    and stores the values in the data frame.
    RETURNS:
        - DataFrame
    '''
    for comb in itertools.combinations(range(1,NPOINTS+1), 3):
        a = getSideLength(df,comb[0],comb[1])
        b = getSideLength(df,comb[0],comb[2])
        c = getSideLength(df,comb[1],comb[2])
        df[getPerimeterColname(comb)] = (a + b + c)

        # Heron's formula: area from the semi-perimeter and side lengths
        s = df[getPerimeterColname(comb)] / 2
        df[getTAreaColname(comb)] = np.sqrt((s*(s-a)*(s-b)*(s-c)))

    return df

df_trainf = getTriangleFeatures(df_trainf)
df_testf = getTriangleFeatures(df_testf)
CPU times: user 6.51 s, sys: 0 ns, total: 6.51 s
Wall time: 6.51 s
In [858]:
%%time
def getRidOfPointVars(df):
    '''
    Drops the raw coordinate columns (the engineered distance and triangle features replace them)
    RETURNS:
        - DataFrame
    '''
    to_drop = []
    for p in range(1,NPOINTS+1):
        to_drop.append(getxLabel(p))
        to_drop.append(getyLabel(p))
    return df.drop(columns=to_drop)

df_trainf = getRidOfPointVars(df_trainf)
df_testf = getRidOfPointVars(df_testf)

print('TOTAL FEATURE COLUMNS:', len(df_trainf.columns))

#drop highly correlated features

# Compute the correlation matrix
corr = df_trainf.drop(columns=['label']).corr()

# Select upper triangle of correlation matrix
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.90 and drop them from both training and test
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

df_trainf = df_trainf.drop(columns=to_drop)
df_testf = df_testf.drop(columns=to_drop)


feature_columns = df_trainf.drop(columns=['label']).columns
print('FEAT COLUMNS REMAINING:',len(feature_columns))

#Convert data to Numpy Arrays
y = df_trainf.label.values
X = df_trainf.drop(columns=['label']).values

y_test = df_testf.label.values
X_test = df_testf.drop(columns=['label']).values
TOTAL FEATURE COLUMNS: 4049
FEAT COLUMNS REMAINING: 460
CPU times: user 4.02 s, sys: 160 ms, total: 4.18 s
Wall time: 4.06 s
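A quick sanity check on the column counts (Python 3.8+ for math.comb): 23 landmarks give C(23,2) = 253 point pairs with two distances each, and C(23,3) = 1771 triplets with a perimeter and an area each:

from math import comb

comb(23, 2) * 2 + comb(23, 3) * 2 + 1   # 506 + 3542 + label column = 4049, as printed above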

5. Classification (Modelling)

  • Feature Scaling
  • Feature selection / reduction
  • Stratified Cross Validation
  • Testing Classifiers / hyper parameter tuning
In [859]:
#Copy training data into other variables
X_train, y_train = X, y 
In [860]:
%%time

#Import the classification and pre-processing libraries we need
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.svm import SVC, LinearSVC
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.gaussian_process.kernels import RBF

def get_name(estimator):
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 66.8 µs
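A quick usage sketch of get_name, which is used later to label the grid-search results:

get_name(SVC())                               # -> 'SVC'
get_name(make_pipeline(PCA(), LinearSVC()))   # -> 'PCA + LinearSVC'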
In [864]:
%%time

#Define pipeline consisting of scaling, reduction, and classification
pipe = Pipeline([
    ('scaler', preprocessing.MinMaxScaler()),
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

#Create arrays of possible options for pipeline
scalers = [preprocessing.MinMaxScaler(),preprocessing.StandardScaler()]
classifiers = [LinearSVC(),SVC(gamma='scale'),RandomForestClassifier(), GaussianProcessClassifier()]
MAX_ITER_OPT = range(100,1000,100)
N_FEATURES_OPTIONS = [15,20,25,30]
C_OPTIONS = np.logspace(-2, 7, 10)
n_estimators_opt = range(25,75,25)
max_depth_opt = range(2,4,1)
RANDOM_STATES = [0,5,42]

#Define the parameter grid for experimentation
#Joins together all appropriate combination of scaling, reduction/selection, and classification
param_grid = [
    #SVC Classifier options
    {
        'scaler': scalers,
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': classifiers[:2],
        'classify__C': C_OPTIONS
    },
    {
        'scaler': scalers,
        'reduce_dim': [SelectKBest(f_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify': classifiers[:2],
        'classify__C': C_OPTIONS
    },
    #Gaussian Process Classifier Option
    {
        'scaler': scalers,
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': classifiers[3:],
        'classify__random_state': RANDOM_STATES
    },
    {
        'scaler': scalers,
        'reduce_dim': [SelectKBest(f_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify': classifiers[3:],
        'classify__random_state': RANDOM_STATES
    },
    #Random Forest Classifier Options
    {
        'scaler': scalers,
        'reduce_dim': [PCA(iterated_power=7)],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': classifiers[2:3],
        'classify__n_estimators': n_estimators_opt,
        'classify__max_depth': max_depth_opt,
    },
    {
        'scaler': scalers,
        'reduce_dim': [SelectKBest(f_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify': classifiers[2:3],
        'classify__n_estimators': n_estimators_opt,
        'classify__max_depth': max_depth_opt,
    },
]

#Define scoring metric
scoring = {'Accuracy': make_scorer(accuracy_score)}

#Create a 5-fold cross-validation grid search
grid = GridSearchCV(pipe, cv=5, param_grid=param_grid,
                   scoring=scoring, refit='Accuracy',
                    return_train_score=True)

with ignore_warnings(category=ConvergenceWarning):
    grid.fit(X_train, y_train)

print("BEST CV SCORE:", grid.best_score_)
print("BEST CONFIGURATION:")
print(grid.best_params_)

#Save results into a dataframe locally
results_df = pd.DataFrame(grid.cv_results_)
results_df.to_pickle('results_df.pkl')
BEST CV SCORE: 0.98
BEST CONFIGURATION:
{'classify': SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classify__C': 10.0, 'reduce_dim': SelectKBest(k=15, score_func=<function f_classif at 0x7fa621afeea0>), 'reduce_dim__k': 15, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}
CPU times: user 2min 40s, sys: 3min 19s, total: 5min 59s
Wall time: 1min 13s
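For scale: the parameter grid above enumerates 432 candidate configurations (2 x 4 x 2 x 10 = 160 for each of the two SVC blocks, 2 x 4 x 1 x 3 = 24 for each Gaussian-process block, 2 x 4 x 1 x 2 x 2 = 32 for each random-forest block), each cross-validated on 5 folds, i.e. about 2,160 fits. The winning configuration can also be rebuilt as a standalone pipeline; a minimal sketch from the reported best_params_ (grid.best_estimator_ already holds the fitted equivalent):

best_pipe = Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('reduce_dim', SelectKBest(f_classif, k=15)),
    ('classify', SVC(C=10.0, gamma='scale'))
])
best_pipe.fit(X_train, y_train)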
In [1040]:
#Outputting final results for train and test

print("FINAL TRAIN SCORE:", grid.best_estimator_.score(X,y))
y_pred = grid.best_estimator_.predict(X)
df_train['label'] = y_pred

final_train_results = df_train.copy()
final_train_results['label'] = final_train_results['label'].map(lambda x: mapping_index[x])
final_train_results.to_csv('trainfile_output.txt',sep=' ',index=False,header=False)
final_train_results.groupby('label').size().plot(kind='bar')
plt.show()


print("FINAL TEST CLASSIFICATION:")
y_test = grid.best_estimator_.predict(X_test)
df_test['label'] = y_test
df_testf['label'] = y_test

final_test_results = df_test.copy()
final_test_results['label'] = final_test_results['label'].map(lambda x: mapping_index[x])
final_test_results.to_csv('testfile_output.txt',sep=' ',index=False,header=False)
final_test_results.groupby('label').size().plot(kind='bar')
plt.show()
FINAL TRAIN SCORE: 1.0
FINAL TEST CLASSIFICATION:

6. Communicating Results

  • Discriminative feature correlation plot (sketched below)
  • 2D representation of classification results for training and test
  • Plots of model metrics across the various classifiers
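For the first bullet, a minimal sketch (assuming the best pipeline above uses SelectKBest, as reported in Section 5) that plots correlations between the selected discriminative features:

selector = grid.best_estimator_.named_steps['reduce_dim']     # fitted SelectKBest step
kept = feature_columns[selector.get_support()]                # names of the k selected features

sns.heatmap(df_trainf[kept].corr(), cmap='coolwarm', center=0, annot=True, fmt='.2f')
plt.show()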
In [866]:
results_df = pd.read_pickle('results_df.pkl')
In [867]:
#Getting names of various components of the pipeline
results_df['param_classify_name'] = results_df['param_classify'].map(lambda x: get_name(x))
results_df['param_reduce_dim_name'] = results_df['param_reduce_dim'].map(lambda x: get_name(x))
results_df['param_scaler_name'] = results_df['param_scaler'].map(lambda x: get_name(x))
results_df['param_reduce_dim__n_components'] = results_df['param_reduce_dim__n_components']\
        .fillna(results_df['param_reduce_dim__k'])
results_df['param_preprocess_name'] = results_df[['param_scaler_name','param_reduce_dim_name']]\
                .apply(lambda x: '+'.join(x), axis=1)
In [971]:
#Printing top results for each combination of scaler, reducer, and classifier
results_df.groupby(['param_scaler_name',
        'param_reduce_dim_name',
        'param_classify_name'])['mean_test_Accuracy'].max().reset_index().sort_values('mean_test_Accuracy',ascending=False)
Out[971]:
    param_scaler_name  param_reduce_dim_name  param_classify_name        mean_test_Accuracy
7   MinMaxScaler       SelectKBest            SVC                        0.98
15  StandardScaler     SelectKBest            SVC                        0.98
4   MinMaxScaler       SelectKBest            GaussianProcessClassifier  0.97
5   MinMaxScaler       SelectKBest            LinearSVC                  0.96
12  StandardScaler     SelectKBest            GaussianProcessClassifier  0.96
13  StandardScaler     SelectKBest            LinearSVC                  0.95
1   MinMaxScaler       PCA                    LinearSVC                  0.92
3   MinMaxScaler       PCA                    SVC                        0.92
9   StandardScaler     PCA                    LinearSVC                  0.92
0   MinMaxScaler       PCA                    GaussianProcessClassifier  0.91
11  StandardScaler     PCA                    SVC                        0.89
6   MinMaxScaler       SelectKBest            RandomForestClassifier     0.88
14  StandardScaler     SelectKBest            RandomForestClassifier     0.84
10  StandardScaler     PCA                    RandomForestClassifier     0.82
2   MinMaxScaler       PCA                    RandomForestClassifier     0.76
8   StandardScaler     PCA                    GaussianProcessClassifier  0.47
In [884]:
#Accuracy Distribution
sns.set_style('whitegrid')
sns.despine(left=True)
sns.set_context("talk")

fig, ax = plt.subplots((1), figsize=(24,8))
for metric in ['mean_test_Accuracy']:
    sns.distplot(results_df[metric],label=metric, ax=ax)

ax.set_xlabel('mean_test_accuracy')
ax.set_ylabel('Probability Density')

plt.show()
In [883]:
#Accuracy Distribution for each PreProcesser
fig, ax = plt.subplots((1), figsize=(24,8))
for label in results_df['param_preprocess_name'].unique():
    print(label)
    mask = results_df['param_preprocess_name']==label
#     sns.distplot(results_df[mask]['mean_test_Precision'], bins=5,
#                  kde=True, rug=False, norm_hist=True,label=label, ax=ax)
    sns.kdeplot(results_df[mask]['mean_test_Accuracy'],shade=True,label=label, ax=ax)

ax.set_xlabel('mean_test_accuracy')
ax.set_ylabel('Probability Density')
plt.show()
MinMaxScaler+PCA
StandardScaler+PCA
MinMaxScaler+SelectKBest
StandardScaler+SelectKBest
In [882]:
#Accuracy Distributions for each classifier
fig, ax = plt.subplots((1), figsize=(24,8))
for label in results_df['param_classify_name'].unique():
    print(label)
    mask = results_df['param_classify_name']==label

    sns.kdeplot(results_df[mask]['mean_test_Accuracy'],shade=True,label=label, ax=ax)

ax.set_xlabel('mean_test_accuracy')
ax.set_ylabel('Probability Density')
plt.show()
LinearSVC
SVC
GaussianProcessClassifier
RandomForestClassifier
In [881]:
#Detailed Box Chart
plt.rcParams['figure.figsize'] = [16,12]
sns.set_style('whitegrid')
sns.despine(left=True)
sns.set_context("talk")

g = sns.catplot(x="mean_test_Accuracy", y="param_preprocess_name", row="param_classify_name",
#                 col='param_scaler_name',
                hue="param_reduce_dim__n_components",
                kind="box", orient="h", height=6.0, aspect=4,
                data=results_df,
               palette="Set2",dodge=True)

plt.show()