Copyright 2021-2023 Patrick Hall (jphall@gwu.edu)
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
DISCLAIMER: This notebook is not legal or compliance advice.
import os # for directory and file manipulation
import numpy as np # for basic array manipulation
import pandas as pd # for dataframe manipulation
import datetime # for timestamp
# for model eval
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error, roc_auc_score
# global constants
ROUND = 3      # generally, insane precision is not needed
SEED = 12345   # seed for better reproducibility

# set global random seed for better reproducibility
np.random.seed(SEED)

y_name = 'high_priced'      # known target column in key.csv
scores_dir = 'data/scores'  # directory holding key.csv plus one score CSV per model

# init score frame with known test y values
# (os.path.join is portable and safer than manual concatenation with os.sep)
scores_frame = pd.read_csv(os.path.join(scores_dir, 'key.csv'), index_col='Unnamed: 0')

# create random folds in reproducible way
np.random.seed(SEED)
scores_frame['fold'] = np.random.choice(5, scores_frame.shape[0])

# read in each score file in the directory as a new column
# column name is the file name without its '.csv' extension
for file in sorted(os.listdir(scores_dir)):
    if file != 'key.csv' and file.endswith('.csv'):
        scores_frame[file[:-4]] = pd.read_csv(os.path.join(scores_dir, file))['phat']

# sanity check
scores_frame
high_priced | fold | group1_rem_ebm | group2_rem_ebm | group2_rem_ebm2 | group3_rem_piml_EBM | group3_rem_piml_EBM2 | group5_rem_xgb2 | group8_rem_ebm | group9_rem_xgb | ph_rem_ebm | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 2 | 0.118787 | 0.080557 | 0.080557 | 0.920389 | 0.136749 | 0.078326 | 0.223846 | 0.081792 | 0.219429 |
1 | 0.0 | 1 | 0.084506 | 0.026001 | 0.026001 | 0.969301 | 0.053751 | 0.035825 | 0.053926 | 0.110702 | 0.053929 |
2 | 1.0 | 4 | 0.210389 | 0.194961 | 0.194961 | 0.814272 | 0.182311 | 0.195332 | 0.143522 | 0.204048 | 0.133863 |
3 | 0.0 | 1 | 0.008529 | 0.028556 | 0.028556 | 0.974559 | 0.004065 | 0.022765 | 0.009371 | 0.024038 | 0.014419 |
4 | 1.0 | 2 | 0.189933 | 0.208263 | 0.208263 | 0.802908 | 0.211120 | 0.193035 | 0.151100 | 0.170243 | 0.156047 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19826 | 0.0 | 3 | 0.163697 | 0.228342 | 0.228342 | 0.792251 | 0.209322 | 0.235192 | 0.216720 | 0.181403 | 0.184214 |
19827 | 0.0 | 1 | 0.114999 | 0.253998 | 0.253998 | 0.762946 | 0.206744 | 0.235832 | 0.161401 | 0.159468 | 0.141663 |
19828 | 1.0 | 3 | 0.141307 | 0.213364 | 0.213364 | 0.747401 | 0.246610 | 0.208723 | 0.242814 | 0.138141 | 0.233266 |
19829 | 0.0 | 1 | 0.007766 | 0.002176 | 0.002176 | 0.996455 | 0.000268 | 0.018702 | 0.005657 | 0.034570 | 0.009914 |
19830 | 0.0 | 0 | 0.163946 | 0.185484 | 0.185484 | 0.811429 | 0.177857 | 0.215085 | 0.167812 | 0.177785 | 0.155447 |
19831 rows × 11 columns
def max_acc(y, phat, res=0.01):
    """ Utility function for finding max. accuracy at some cutoff.

    :param y: Known y values (binary 0/1 labels; Series or 1-D array).
    :param phat: Model scores (probabilities; Series or 1-D array).
    :param res: Resolution over which to search for max. accuracy, default 0.01.
    :return: Max. accuracy for model scores across cutoffs in [0, 1].
    """
    # work on plain arrays to avoid index-alignment surprises and frame copies
    y_arr = np.asarray(y)
    phat_arr = np.asarray(phat)

    # track a running best instead of appending rows to a DataFrame:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway
    best = 0.0
    for cut in np.arange(0, 1 + res, res):
        # binary decision at this cutoff
        decision = np.where(phat_arr > cut, 1, 0)
        # accuracy == fraction of matching labels (same as sklearn accuracy_score)
        best = max(best, float(np.mean(decision == y_arr)))

    return best
def max_f1(y, phat, res=0.01):
    """ Utility function for finding max. F1 at some cutoff.

    :param y: Known y values (binary 0/1 labels; Series or 1-D array).
    :param phat: Model scores (probabilities; Series or 1-D array).
    :param res: Resolution over which to search for max. F1, default 0.01.
    :return: Max. F1 for model scores across cutoffs in [0, 1].
    """
    # work on plain arrays to avoid index-alignment surprises and frame copies
    y_arr = np.asarray(y)
    phat_arr = np.asarray(phat)

    # track a running best instead of appending rows to a DataFrame:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway
    best = 0.0
    for cut in np.arange(0, 1 + res, res):
        decision = np.where(phat_arr > cut, 1, 0)
        # F1 = 2*TP / (2*TP + FP + FN); treated as 0 when the denominator is 0,
        # matching sklearn.metrics.f1_score's default zero_division behavior
        tp = np.sum((decision == 1) & (y_arr == 1))
        fp = np.sum((decision == 1) & (y_arr == 0))
        fn = np.sum((decision == 0) & (y_arr == 1))
        denom = 2 * tp + fp + fn
        if denom > 0:
            best = max(best, float(2 * tp / denom))

    return best
metric_list = ['acc', 'auc', 'f1', 'logloss', 'mse']  # metrics to use for evaluation

# build eval rows as plain dicts and construct the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call
eval_rows = []
for fold in sorted(scores_frame['fold'].unique()):  # loop through folds
    # cache known y values for the fold once (invariant across metrics)
    fold_y = scores_frame.loc[scores_frame['fold'] == fold, y_name]
    for metric_name in metric_list:  # loop through metrics
        # one row per (fold, metric) pair
        row_dict = {'fold': fold, 'metric': metric_name}
        for col_name in scores_frame.columns[2:]:
            # cache fold scores for this model's score column
            fold_scores = scores_frame.loc[scores_frame['fold'] == fold, col_name]
            # calculate evaluation metric for fold
            if metric_name == 'acc':
                value = max_acc(fold_y, fold_scores)
            elif metric_name == 'auc':
                value = roc_auc_score(fold_y, fold_scores)
            elif metric_name == 'f1':
                value = max_f1(fold_y, fold_scores)
            elif metric_name == 'logloss':
                value = log_loss(fold_y, fold_scores)
            else:  # 'mse'
                value = mean_squared_error(fold_y, fold_scores)
            # store with reasonable precision
            row_dict[col_name] = np.round(value, ROUND)
        eval_rows.append(row_dict)

eval_frame = pd.DataFrame(eval_rows)
# legacy append() upcast 'fold' to float; keep that dtype for identical output
eval_frame['fold'] = eval_frame['fold'].astype(float)

# set columns to necessary order: fold, metric, then model columns sorted by name
model_names = [name for name in sorted(eval_frame.columns) if name not in ['fold', 'metric']]
eval_frame = eval_frame[['fold', 'metric'] + model_names]

# determine score ranks row-by-row:
# lower-is-better metrics (logloss, mse) rank ascending, the rest descending
rank_names = [name + '_rank' for name in model_names]
rank_rows = []
for i in range(eval_frame.shape[0]):
    ascending = eval_frame.loc[i, 'metric'] in ['logloss', 'mse']
    rank_rows.append(eval_frame.iloc[i, 2:].rank(ascending=ascending).values)
rank_frame = pd.DataFrame(rank_rows, columns=rank_names)

# merge ranks onto eval_frame
eval_frame = pd.concat([eval_frame, rank_frame], axis=1)

# house keeping
del rank_frame

# sanity check
eval_frame
fold | metric | group1_rem_ebm | group2_rem_ebm | group2_rem_ebm2 | group3_rem_piml_EBM | group3_rem_piml_EBM2 | group5_rem_xgb2 | group8_rem_ebm | group9_rem_xgb | ph_rem_ebm | group1_rem_ebm_rank | group2_rem_ebm_rank | group2_rem_ebm2_rank | group3_rem_piml_EBM_rank | group3_rem_piml_EBM2_rank | group5_rem_xgb2_rank | group8_rem_ebm_rank | group9_rem_xgb_rank | ph_rem_ebm_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | acc | 0.900 | 0.901 | 0.901 | 0.900 | 0.901 | 0.901 | 0.901 | 0.900 | 0.901 | 8.0 | 3.5 | 3.5 | 8.0 | 3.5 | 3.5 | 3.5 | 8.0 | 3.5 |
1 | 0.0 | auc | 0.781 | 0.840 | 0.840 | 0.163 | 0.821 | 0.836 | 0.793 | 0.797 | 0.791 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 6.0 | 5.0 | 7.0 |
2 | 0.0 | f1 | 0.347 | 0.405 | 0.405 | 0.182 | 0.381 | 0.392 | 0.342 | 0.357 | 0.347 | 6.5 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 8.0 | 5.0 | 6.5 |
3 | 0.0 | logloss | 0.280 | 0.251 | 0.251 | 3.257 | 0.262 | 0.254 | 0.274 | 0.277 | 0.275 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 5.0 | 7.0 | 6.0 |
4 | 0.0 | mse | 0.082 | 0.077 | 0.077 | 0.773 | 0.078 | 0.077 | 0.081 | 0.081 | 0.081 | 8.0 | 2.0 | 2.0 | 9.0 | 4.0 | 2.0 | 6.0 | 6.0 | 6.0 |
5 | 1.0 | acc | 0.906 | 0.906 | 0.906 | 0.906 | 0.906 | 0.906 | 0.906 | 0.906 | 0.906 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
6 | 1.0 | auc | 0.767 | 0.828 | 0.828 | 0.172 | 0.810 | 0.822 | 0.774 | 0.779 | 0.772 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 6.0 | 5.0 | 7.0 |
7 | 1.0 | f1 | 0.312 | 0.368 | 0.368 | 0.172 | 0.348 | 0.360 | 0.319 | 0.329 | 0.321 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 7.0 | 5.0 | 6.0 |
8 | 1.0 | logloss | 0.272 | 0.246 | 0.246 | 3.253 | 0.258 | 0.250 | 0.270 | 0.271 | 0.272 | 7.5 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 5.0 | 6.0 | 7.5 |
9 | 1.0 | mse | 0.079 | 0.074 | 0.074 | 0.778 | 0.077 | 0.075 | 0.079 | 0.078 | 0.079 | 7.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 7.0 | 5.0 | 7.0 |
10 | 2.0 | acc | 0.908 | 0.908 | 0.908 | 0.908 | 0.908 | 0.910 | 0.908 | 0.908 | 0.909 | 6.0 | 6.0 | 6.0 | 6.0 | 6.0 | 1.0 | 6.0 | 6.0 | 2.0 |
11 | 2.0 | auc | 0.759 | 0.825 | 0.825 | 0.175 | 0.815 | 0.826 | 0.781 | 0.772 | 0.780 | 8.0 | 2.5 | 2.5 | 9.0 | 4.0 | 1.0 | 5.0 | 7.0 | 6.0 |
12 | 2.0 | f1 | 0.304 | 0.372 | 0.372 | 0.169 | 0.354 | 0.371 | 0.315 | 0.320 | 0.323 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 7.0 | 6.0 | 5.0 |
13 | 2.0 | logloss | 0.271 | 0.246 | 0.246 | 3.284 | 0.251 | 0.245 | 0.264 | 0.271 | 0.264 | 7.5 | 2.5 | 2.5 | 9.0 | 4.0 | 1.0 | 5.5 | 7.5 | 5.5 |
14 | 2.0 | mse | 0.078 | 0.073 | 0.073 | 0.781 | 0.074 | 0.073 | 0.076 | 0.077 | 0.076 | 8.0 | 2.0 | 2.0 | 9.0 | 4.0 | 2.0 | 5.5 | 7.0 | 5.5 |
15 | 3.0 | acc | 0.903 | 0.903 | 0.903 | 0.903 | 0.903 | 0.903 | 0.903 | 0.903 | 0.903 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 |
16 | 3.0 | auc | 0.772 | 0.826 | 0.826 | 0.174 | 0.809 | 0.823 | 0.775 | 0.786 | 0.772 | 7.5 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 6.0 | 5.0 | 7.5 |
17 | 3.0 | f1 | 0.317 | 0.371 | 0.371 | 0.177 | 0.361 | 0.365 | 0.328 | 0.343 | 0.323 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 6.0 | 5.0 | 7.0 |
18 | 3.0 | logloss | 0.276 | 0.252 | 0.252 | 3.254 | 0.262 | 0.253 | 0.275 | 0.275 | 0.276 | 7.5 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 5.5 | 5.5 | 7.5 |
19 | 3.0 | mse | 0.081 | 0.077 | 0.077 | 0.775 | 0.079 | 0.077 | 0.080 | 0.080 | 0.080 | 8.0 | 2.0 | 2.0 | 9.0 | 4.0 | 2.0 | 6.0 | 6.0 | 6.0 |
20 | 4.0 | acc | 0.895 | 0.897 | 0.897 | 0.895 | 0.895 | 0.898 | 0.895 | 0.896 | 0.895 | 7.0 | 2.5 | 2.5 | 7.0 | 7.0 | 1.0 | 7.0 | 4.0 | 7.0 |
21 | 4.0 | auc | 0.754 | 0.831 | 0.831 | 0.170 | 0.818 | 0.828 | 0.785 | 0.779 | 0.782 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 5.0 | 7.0 | 6.0 |
22 | 4.0 | f1 | 0.323 | 0.401 | 0.401 | 0.190 | 0.404 | 0.397 | 0.364 | 0.354 | 0.362 | 8.0 | 2.5 | 2.5 | 9.0 | 1.0 | 4.0 | 5.0 | 7.0 | 6.0 |
23 | 4.0 | logloss | 0.296 | 0.263 | 0.263 | 3.200 | 0.273 | 0.266 | 0.286 | 0.291 | 0.287 | 8.0 | 1.5 | 1.5 | 9.0 | 4.0 | 3.0 | 5.0 | 7.0 | 6.0 |
24 | 4.0 | mse | 0.087 | 0.080 | 0.080 | 0.771 | 0.082 | 0.080 | 0.084 | 0.086 | 0.084 | 8.0 | 2.0 | 2.0 | 9.0 | 4.0 | 2.0 | 5.5 | 7.0 | 5.5 |
# sanity check
eval_frame

# save eval_frame as CSV, timestamped so repeated runs don't overwrite each other
# (markdown residue previously fused into this line made it a syntax error)
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
eval_frame.to_csv('model_eval_' + timestamp + '.csv', index=False)

# overall model ranking: mean rank across all folds and metrics (lower is better)
eval_frame[[name for name in eval_frame.columns if name.endswith('rank')]].mean().sort_values()
group2_rem_ebm_rank 2.28 group2_rem_ebm2_rank 2.28 group5_rem_xgb2_rank 2.74 group3_rem_piml_EBM2_rank 4.14 group8_rem_ebm_rank 5.74 group9_rem_xgb_rank 5.96 ph_rem_ebm_rank 5.96 group1_rem_ebm_rank 7.46 group3_rem_piml_EBM_rank 8.44 dtype: float64