This is a follow up attempt at Kaggle's Predicting Red Hat Business Value competition.
See my notebooks section for links to the first attempt and other kaggle competitions.
The focus of this iteration is exploring whether we can bring back the previously ignored categorical columns that have hundreds if not thousands of unique values, making it impractical to use one-hot encoding.
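Two approaches are taken on categorical variables with a large number of unique values: (1) ordinal encoding, where each value is mapped to an integer according to its sorted order, and (2) a combined one-hot / binary encoding (the OmniEncoder below), which fits each variable into a bounded number of columns.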
The end results: re-including the columns boosted accuracy on a held-out sample of the training data by only about 0.5 percentage points (0.880 to 0.885), and surprisingly the binary / one-hot combo did hardly any better than the ordinal encoding.
import pandas as pd
# Load the people table directly from the zipped CSV.
people = pd.read_csv('people.csv.zip')
# Show the first rows as a quick sanity check of the parsed columns.
people.head(3)
people_id | char_1 | group_1 | char_2 | date | char_3 | char_4 | char_5 | char_6 | char_7 | ... | char_29 | char_30 | char_31 | char_32 | char_33 | char_34 | char_35 | char_36 | char_37 | char_38 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ppl_100 | type 2 | group 17304 | type 2 | 2021-06-29 | type 5 | type 5 | type 5 | type 3 | type 11 | ... | False | True | True | False | False | True | True | True | False | 36 |
1 | ppl_100002 | type 2 | group 8688 | type 3 | 2021-01-06 | type 28 | type 9 | type 5 | type 3 | type 11 | ... | False | True | True | True | True | True | True | True | False | 76 |
2 | ppl_100003 | type 2 | group 33592 | type 3 | 2022-06-10 | type 4 | type 8 | type 5 | type 2 | type 5 | ... | False | False | True | True | True | True | False | True | True | 99 |
3 rows × 41 columns
# Load the training activities (including the `outcome` label) from the zipped CSV.
actions = pd.read_csv('act_train.csv.zip')
# Show the first rows as a quick sanity check of the parsed columns.
actions.head(3)
people_id | activity_id | date | activity_category | char_1 | char_2 | char_3 | char_4 | char_5 | char_6 | char_7 | char_8 | char_9 | char_10 | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ppl_100 | act2_1734928 | 2023-08-26 | type 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | type 76 | 0 |
1 | ppl_100 | act2_2434093 | 2022-09-27 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | type 1 | 0 |
2 | ppl_100 | act2_3404049 | 2022-09-27 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | type 1 | 0 |
# Inner-join every action to its person on people_id. Columns that exist in
# both tables (e.g. date, char_1) get the _action / _person suffixes.
training_data_full = pd.merge(actions, people, how='inner', on='people_id', suffixes=['_action', '_person'], sort=False)
# Display the first merged rows.
training_data_full.head(5)
people_id | activity_id | date_action | activity_category | char_1_action | char_2_action | char_3_action | char_4_action | char_5_action | char_6_action | ... | char_29 | char_30 | char_31 | char_32 | char_33 | char_34 | char_35 | char_36 | char_37 | char_38 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ppl_100 | act2_1734928 | 2023-08-26 | type 4 | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | True | True | False | False | True | True | True | False | 36 |
1 | ppl_100 | act2_2434093 | 2022-09-27 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | True | True | False | False | True | True | True | False | 36 |
2 | ppl_100 | act2_3404049 | 2022-09-27 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | True | True | False | False | True | True | True | False | 36 |
3 | ppl_100 | act2_3651215 | 2023-08-04 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | True | True | False | False | True | True | True | False | 36 |
4 | ppl_100 | act2_4109017 | 2023-08-26 | type 2 | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | True | True | False | False | True | True | True | False | 36 |
5 rows × 55 columns
# Compare (rows, columns) of both inputs and of the merged frame.
(actions.shape, people.shape, training_data_full.shape)
((2197291, 15), (189118, 41), (2197291, 55))
Notice the new OmniEncoder transformer; you can read more about its development in my learning log.
# %load "preprocessing_transforms.py"
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
import heapq
import numpy as np
class BaseTransformer(BaseEstimator, TransformerMixin):
    """
    Minimal stateless base class for the DataFrame transformers in this file.

    Subclasses override `fit` and/or `transform` as needed; the defaults
    learn nothing and pass the data through unchanged.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from X; return self so calls can be chained."""
        return self

    def transform(self, X, **transform_params):
        """
        Identity transform: return X unchanged.

        The data (not the transformer) must be returned here, otherwise a
        subclass that only overrides `fit` would hand the transformer object
        itself to the next pipeline step.
        """
        return X
class ColumnSelector(BaseTransformer):
    """
    Picks a fixed set of columns out of a pandas DataFrame.

    :param columns: column label(s) used to index the incoming DataFrame
    :param c_type: optional dtype; when given, the selection is cast to it
    """

    def __init__(self, columns, c_type=None):
        self.columns = columns
        self.c_type = c_type

    def transform(self, X, **transform_params):
        """Return `X[self.columns]`, cast to `c_type` when one was configured."""
        selected = X[self.columns]
        return selected if self.c_type is None else selected.astype(self.c_type)
class OmniEncoder(BaseTransformer):
    """
    Encodes a categorical variable using no more than k columns. As many values as possible
    are one-hot encoded, the remaining are fit within a binary encoded set of columns.
    If necessary some are dropped (e.g if (#unique_values) > 2^k).

    In deciding which values to one-hot encode, those that appear more frequently are
    preferred. Likewise, when values have to be dropped, the least frequent ones go first.

    Values that were not seen during `fit` (as well as missing and dropped values)
    are encoded as all zeros.

    :param max_cols: maximum number of output columns per input column (3..100)
    :raises ValueError: if `max_cols` is outside the supported range
    """

    def __init__(self, max_cols=20):
        self.column_infos = {}
        self.max_cols = max_cols
        if max_cols < 3 or max_cols > 100:
            raise ValueError("max_cols {} not within range(3, 100)".format(max_cols))

    def fit(self, X, y=None, **fit_params):
        """Decide, per column of X, which values are one-hot and which are binary encoded."""
        self.column_infos = {col: self._column_info(X[col], self.max_cols) for col in X.columns}
        return self

    def transform(self, X, **transform_params):
        """Encode every column of X and concatenate the results side by side."""
        return pd.concat(
            [self._encode_column(X[col], self.max_cols, *self.column_infos[col]) for col in X.columns],
            axis=1
        )

    @staticmethod
    def _encode_column(col, max_cols, one_hot_vals, binary_encoded_vals):
        """
        Encode a single column.

        :param col: pd.Series to encode
        :param max_cols: maximum number of output columns
        :param one_hot_vals: values that each get their own 0/1 column
        :param binary_encoded_vals: values that share the binary-coded columns;
            the value at position i is written as the bits of i + 1
        :return: integer pd.DataFrame indexed like `col`, with one column per
            one-hot value followed by one column per bit
        """
        num_one_hot = len(one_hot_vals)
        num_bits = max(max_cols - num_one_hot, 0) if len(binary_encoded_vals) > 0 else 0

        # Code 0 (all bits off) is reserved for "no binary-encoded value", so
        # only 2**num_bits - 1 values are representable. `fit` already trims the
        # list; this guard keeps the bit width consistent for any other caller.
        binary_encoded_vals = list(binary_encoded_vals)[:2 ** num_bits - 1]

        # Bit columns are named after the single bit they carry, most
        # significant first (e.g. "100", "010", "001").
        bit_cols = [np.binary_repr(2 ** i, width=num_bits) for i in reversed(range(num_bits))]
        col_names = (["{}_{}".format(col.name, val) for val in one_hot_vals]
                     + ["{}_{}".format(col.name, bit_col) for bit_col in bit_cols])

        encoded = np.zeros((len(col), num_one_hot + num_bits), dtype=np.int64)

        # One vectorised comparison per one-hot value instead of a Python-level
        # loop over every (row, value) pair.
        for position, val in enumerate(one_hot_vals):
            encoded[:, position] = (col == val).values

        if num_bits:
            # Row `code` of the table holds the bits of `code`; row 0 stays all
            # zeros and is what unknown / missing values map to.
            bit_table = np.zeros((len(binary_encoded_vals) + 1, num_bits), dtype=np.int64)
            for code in range(1, len(binary_encoded_vals) + 1):
                bit_table[code] = [int(bit) for bit in np.binary_repr(code, width=num_bits)]

            code_of = {val: idx + 1 for idx, val in enumerate(binary_encoded_vals)}
            codes = np.fromiter((code_of.get(v, 0) for v in col), dtype=np.int64, count=len(col))
            encoded[:, num_one_hot:] = bit_table[codes]

        return pd.DataFrame(encoded, index=col.index, columns=col_names)

    @staticmethod
    def _column_info(col, max_cols):
        """
        Work out how the values of one column are split between the encodings.

        :param col: pd.Series
        :param max_cols: maximum number of output columns
        :return: (one_hot_vals, binary_encoded_vals), both sorted lists
        """
        val_counts = dict(col.value_counts())
        num_one_hot = OmniEncoder._num_onehot(len(val_counts), max_cols)
        one_hot_vals, binary_vals = OmniEncoder._partition_one_hot(val_counts, num_one_hot)

        # If even an all-binary layout cannot hold every value, keep only the
        # most frequent ones that fit; the rest are dropped (encoded as zeros).
        bin_capacity = 2 ** (max_cols - num_one_hot) - 1
        if len(binary_vals) > bin_capacity:
            kept = set(heapq.nlargest(bin_capacity, binary_vals, key=val_counts.get))
            binary_vals = [val for val in binary_vals if val in kept]

        return one_hot_vals, binary_vals

    @staticmethod
    def _partition_one_hot(val_counts, num_one_hot):
        """
        Paritions the values in val counts into a list of values that should be
        one-hot encoded and a list of values that should be binary encoded.

        The `num_one_hot` most popular values are chosen to be one-hot encoded.

        :param val_counts: {'val': 433}
        :param num_one_hot: the number of elements to be one-hot encoded
        :return: ['val1', 'val2'], ['val55', 'val59']
        """
        one_hot_vals = [k for (k, count) in heapq.nlargest(num_one_hot, val_counts.items(), key=lambda t: t[1])]
        one_hot_vals_lookup = set(one_hot_vals)

        bin_encoded_vals = [val for val in val_counts if val not in one_hot_vals_lookup]

        return sorted(one_hot_vals), sorted(bin_encoded_vals)

    @staticmethod
    def _num_onehot(n, k):
        """
        Determines the number of onehot columns we can have to encode n values
        in no more than k columns, assuming we will binary encode the rest.

        :param n: The number of unique values to encode
        :param k: The maximum number of columns we have
        :return: The number of one-hot columns to use (0 if even pure binary
            encoding cannot hold all n values)
        """
        num_one_hot = min(n, k)

        def num_bin_vals(num):
            # `num` bits can label 2**num - 1 values; the all-zero pattern is
            # reserved for "not a binary-encoded value".
            if num == 0:
                return 0
            return 2 ** num - 1

        def capacity(oh):
            """
            Capacity given we are using `oh` one hot columns.
            """
            return oh + num_bin_vals(k - oh)

        # Trade one-hot columns for bit columns until everything fits.
        while capacity(num_one_hot) < n and num_one_hot > 0:
            num_one_hot -= 1

        return num_one_hot
class EncodeCategorical(BaseTransformer):
    """
    Ordinal encoder: replaces each label by its 1-based rank among the sorted
    distinct non-null labels seen for that column during `fit`.
    """

    def __init__(self):
        self.categorical_vals = {}

    def fit(self, X, y=None, **fit_params):
        """Build, per column, the label -> rank lookup from the values in X."""
        lookup = {}
        for name in X.columns:
            ordered_labels = sorted(X[name].dropna().unique())
            lookup[name] = {label: rank for rank, label in enumerate(ordered_labels, start=1)}
        self.categorical_vals = lookup
        return self

    def transform(self, X, **transform_params):
        """Map every column of X through its fitted lookup and reassemble the frame."""
        encoded = [X[name].map(self.categorical_vals[name]) for name in X.columns]
        return pd.concat(encoded, axis=1)
class SpreadBinary(BaseTransformer):
    """Re-codes a 0/1 indicator frame as -1/+1: cells equal to 1 become 1, everything else -1."""

    def transform(self, X, **transform_params):
        """
        :param X: pandas DataFrame of indicator values
        :return: DataFrame with the same index and columns holding 1 / -1
        """
        # A single vectorised comparison replaces the per-cell Python lambda
        # of `applymap`, which is slow on wide frames and deprecated in recent
        # pandas releases.
        return pd.DataFrame(np.where(X == 1, 1, -1), index=X.index, columns=X.columns)
class DfTransformerAdapter(BaseTransformer):
    """
    Wraps a scikit-learn transformer so that `transform` hands back a pandas
    DataFrame carrying the input's index and column labels.
    """

    def __init__(self, transformer):
        self.transformer = transformer

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped transformer and return this adapter."""
        self.transformer.fit(X, y=y, **fit_params)
        return self

    def transform(self, X, **transform_params):
        """Run the wrapped transform and re-label its output like X."""
        values = self.transformer.transform(X, **transform_params)
        return pd.DataFrame(values, columns=X.columns, index=X.index)
class DfOneHot(BaseTransformer):
    """
    One-hot encodes every column of a DataFrame via `pd.get_dummies`.

    The dummy columns produced at fit time are remembered, and every later
    transform is re-indexed onto exactly those columns (absent ones are
    filled with 0).
    """

    def __init__(self):
        self.dummy_columns = []

    @staticmethod
    def _dummies(X):
        """One-hot encode all columns of X, prefixing each dummy with its source column."""
        prefixes = list(X.columns)
        return pd.get_dummies(X, prefix=prefixes, columns=X.columns)

    def fit(self, X, y=None, **fit_params):
        """Record the dummy column labels that X produces."""
        self.dummy_columns = self._dummies(X).columns
        return self

    def transform(self, X, **transform_params):
        """Encode X and align the result to the columns recorded by `fit`."""
        encoded = self._dummies(X)
        return encoded.reindex(columns=self.dummy_columns, fill_value=0)
class DfFeatureUnion(BaseTransformer):
    """
    DataFrame-aware counterpart of `FeatureUnion`.

    :param transformers: list of (label, transformer) pairs; each transformer
        receives the same input and the outputs are joined column-wise
    """

    def __init__(self, transformers):
        self.transformers = transformers

    def fit(self, X, y=None, **fit_params):
        """Fit every member transformer on the same X / y."""
        for _, transformer in self.transformers:
            transformer.fit(X, y=y, **fit_params)
        return self

    def transform(self, X, **transform_params):
        """Transform X with every member and concatenate the resulting frames."""
        frames = []
        for _, transformer in self.transformers:
            frames.append(transformer.transform(X, **transform_params))
        return pd.concat(frames, axis=1)
# Report how many distinct values (NaN counted once) each merged column holds.
report_line = "in {} there are {} unique values"
for col in training_data_full.columns:
    distinct_values = training_data_full[col].unique()
    print(report_line.format(col, len(distinct_values)))
# Trailing None keeps the notebook from echoing a value for this cell.
None
in people_id there are 151295 unique values in activity_id there are 2197291 unique values in date_action there are 411 unique values in activity_category there are 7 unique values in char_1_action there are 52 unique values in char_2_action there are 33 unique values in char_3_action there are 12 unique values in char_4_action there are 8 unique values in char_5_action there are 8 unique values in char_6_action there are 6 unique values in char_7_action there are 9 unique values in char_8_action there are 19 unique values in char_9_action there are 20 unique values in char_10_action there are 6516 unique values in outcome there are 2 unique values in char_1_person there are 2 unique values in group_1 there are 29899 unique values in char_2_person there are 3 unique values in date_person there are 1196 unique values in char_3_person there are 43 unique values in char_4_person there are 25 unique values in char_5_person there are 9 unique values in char_6_person there are 7 unique values in char_7_person there are 25 unique values in char_8_person there are 8 unique values in char_9_person there are 9 unique values in char_10_person there are 2 unique values in char_11 there are 2 unique values in char_12 there are 2 unique values in char_13 there are 2 unique values in char_14 there are 2 unique values in char_15 there are 2 unique values in char_16 there are 2 unique values in char_17 there are 2 unique values in char_18 there are 2 unique values in char_19 there are 2 unique values in char_20 there are 2 unique values in char_21 there are 2 unique values in char_22 there are 2 unique values in char_23 there are 2 unique values in char_24 there are 2 unique values in char_25 there are 2 unique values in char_26 there are 2 unique values in char_27 there are 2 unique values in char_28 there are 2 unique values in char_29 there are 2 unique values in char_30 there are 2 unique values in char_31 there are 2 unique values in char_32 there are 2 unique values in 
char_33 there are 2 unique values in char_34 there are 2 unique values in char_35 there are 2 unique values in char_36 there are 2 unique values in char_37 there are 2 unique values in char_38 there are 101 unique values
Notice that char_10_action, group_1 and others have a ton of unique values; one-hot encoding will result in a dataframe with thousands of columns.
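Let's explore 3 approaches to dealing with categorical columns with a lot of unique values and compare performance: (1) ignore those columns entirely, (2) encode them as ordinal integers, and (3) encode them with the OmniEncoder, a one-hot / binary combination capped at a fixed number of columns (tried here with 20 and with 50).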
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
# Low-cardinality categorical / boolean columns, in the order they are fed
# to the encoders: the activity category, the per-action characteristics
# (char_1..char_9), the per-person characteristics that clash with action
# names (char_1..char_10) and the remaining person flags (char_11..char_37).
cat_columns = (
    ['activity_category']
    + ['char_{}_action'.format(i) for i in range(1, 10)]
    + ['char_{}_person'.format(i) for i in range(1, 11)]
    + ['char_{}'.format(i) for i in range(11, 38)]
)

# Categorical columns with hundreds to thousands of distinct values.
high_dim_cat_columns = ['date_action', 'char_10_action', 'group_1', 'date_person']

# Columns treated as quantitative.
q_columns = ['char_38']
def _select_quantitative():
    """Named step that selects the quantitative columns as floats."""
    return ('select-quantitative', ColumnSelector(q_columns, c_type='float'))


def _numeric_branch(first_step):
    """Numeric sub-pipeline: `first_step`, then median imputation, then standard scaling."""
    return Pipeline([
        first_step,
        ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
        ('scale', DfTransformerAdapter(StandardScaler()))
    ])


def _categorical_branch(columns, encoder):
    """Categorical sub-pipeline: select `columns`, encode them, spread 0/1 to -1/+1."""
    return Pipeline([
        ('select-categorical', ColumnSelector(columns)),
        ('apply-onehot', encoder),
        ('spread-binary', SpreadBinary())
    ])


def _preprocessor(quantitative, categorical):
    """Join a numeric and a categorical branch into one single-step pipeline."""
    return Pipeline([
        ('features', DfFeatureUnion([
            ('quantitative', quantitative),
            ('categorical', categorical),
        ]))
    ])


# Approach 1: leave the high-cardinality columns out altogether.
preprocessor_ignore = _preprocessor(
    _numeric_branch(_select_quantitative()),
    _categorical_branch(cat_columns, DfOneHot()),
)

# Approach 2: ordinal-encode the high-cardinality columns and treat them as
# additional quantitative features (imputed and scaled with char_38).
preprocessor_lexico = _preprocessor(
    _numeric_branch(
        ('combine-q', DfFeatureUnion([
            ('highd', Pipeline([
                ('select-highd', ColumnSelector(high_dim_cat_columns)),
                ('encode-highd', EncodeCategorical())
            ])),
            _select_quantitative(),
        ]))
    ),
    _categorical_branch(cat_columns, DfOneHot()),
)

# Approach 3a: OmniEncoder over all categorical columns, at most 20 columns each.
preprocessor_omni_20 = _preprocessor(
    _numeric_branch(_select_quantitative()),
    _categorical_branch(cat_columns + high_dim_cat_columns, OmniEncoder(max_cols=20)),
)

# Approach 3b: same as 3a but with up to 50 columns per variable.
preprocessor_omni_50 = _preprocessor(
    _numeric_branch(_select_quantitative()),
    _categorical_branch(cat_columns + high_dim_cat_columns, OmniEncoder(max_cols=50)),
)
If we train models on the entire training dataset provided, it exhausts the memory on my laptop. Again, in the spirit of getting something quick and dirty working, we'll sample the dataset and train on that sample. We'll then evaluate our model by measuring its accuracy on a larger, disjoint sample.
from sklearn.cross_validation import train_test_split
# Fractions of the full merged dataset used for fitting and for evaluation.
training_frac = 0.01
test_frac = 0.05
# Carve out the training sample; everything else is a candidate for evaluation.
training_data, the_rest = train_test_split(training_data_full, train_size=training_frac, random_state=0)
# `the_rest` is only (1 - training_frac) of the data, so rescale the fraction
# to end up with test_frac of the *full* dataset. The sample is seeded like
# the split above so that reruns evaluate on the same rows.
test_data = the_rest.sample(frac=test_frac / (1 - training_frac), random_state=0)
training_data.shape
(21972, 55)
# (rows, columns) of the evaluation sample.
test_data.shape
(109865, 55)
Some utilities to make reporting progress easier
import time
import subprocess
class time_and_log():
    """
    Context manager that announces the start and end of a labelled step and
    records how much process (CPU) time it took.

    :param label: human readable name of the step
    :param prefix: text printed in front of every message
    :param say: when true, each message is also passed to `cmd_say`

    After the block exits, `interval` holds the elapsed
    `time.process_time()` in seconds.
    """

    def __init__(self, label, *, prefix='', say=False):
        self.label = label
        self.prefix = prefix
        self.say = say

    def _announce(self, text):
        """Print `text` behind the prefix and, if enabled, speak it."""
        print('{}{}'.format(self.prefix, text))
        if self.say:
            cmd_say(text)

    def __enter__(self):
        # Announce first so the announcement itself is not part of the timing.
        self._announce('Starting {}'.format(self.label))
        self.start = time.process_time()
        return self

    def __exit__(self, *exc):
        self.interval = time.process_time() - self.start
        self._announce('Finished {} in {:.2f} seconds'.format(self.label, self.interval))
        # Never swallow an exception raised inside the block.
        return False
def cmd_say(msg):
    """
    Speak `msg` aloud through the `say` command line tool.

    The message is passed as a single argument without going through a shell,
    so quotes or shell metacharacters in `msg` can neither break the command
    nor be executed.

    :param msg: text to speak
    :return: None
    """
    try:
        subprocess.call(['say', msg])
    except OSError:
        # Spoken feedback is a convenience only: when the `say` executable is
        # not available, carry on silently instead of aborting the run.
        pass
# Time the omni-20 preprocessing (fit + transform) on the training sample by
# itself, before any classifier is attached to it.
with time_and_log('wrangling training data', say=True, prefix=" _"):
    wrangled = preprocessor_omni_20.fit_transform(training_data)
_Starting wrangling training data _Finished wrangling training data in 383.88 seconds
# Inspect the encoded feature frame produced by the preprocessor.
wrangled.head()
char_38 | activity_category_type 1 | activity_category_type 2 | activity_category_type 3 | activity_category_type 4 | activity_category_type 5 | activity_category_type 6 | activity_category_type 7 | char_1_action_type 1 | char_1_action_type 10 | ... | date_person_01000000000 | date_person_00100000000 | date_person_00010000000 | date_person_00001000000 | date_person_00000100000 | date_person_00000010000 | date_person_00000001000 | date_person_00000000100 | date_person_00000000010 | date_person_00000000001 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1119692 | -0.413876 | -1 | -1 | -1 | -1 | 1 | -1 | -1 | -1 | -1 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
331126 | 0.332410 | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | ... | 1 | -1 | -1 | 1 | -1 | -1 | 1 | 1 | 1 | -1 |
424011 | -0.192754 | -1 | -1 | -1 | -1 | 1 | -1 | -1 | -1 | -1 | ... | 1 | 1 | 1 | -1 | -1 | -1 | -1 | -1 | 1 | 1 |
341796 | 0.000727 | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
22692 | 0.885214 | -1 | -1 | 1 | -1 | -1 | -1 | -1 | -1 | -1 | ... | 1 | 1 | -1 | -1 | -1 | 1 | 1 | 1 | -1 | -1 |
5 rows × 354 columns
from sklearn.ensemble import RandomForestClassifier
def _forest_on(preprocessor):
    """Chain `preprocessor` with a fresh, identically configured random forest."""
    return Pipeline([
        ('wrangle', preprocessor),
        ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
    ])


# One pipeline per preprocessing approach; only the 'wrangle' step differs,
# so any accuracy difference comes from the encoding.
pipe_rf_ignore = _forest_on(preprocessor_ignore)
pipe_rf_lexico = _forest_on(preprocessor_lexico)
pipe_rf_omni_20 = _forest_on(preprocessor_omni_20)
pipe_rf_omni_50 = _forest_on(preprocessor_omni_50)
# Every column any of the preprocessors may select.
feature_columns = cat_columns + q_columns + high_dim_cat_columns


def extract_X_y(df):
    """
    Split a merged frame into model inputs and target.

    :param df: DataFrame containing `feature_columns` and 'outcome'
    :return: (features DataFrame, outcome Series)
    """
    features = df[feature_columns]
    target = df['outcome']
    return features, target


X_train, y_train = extract_X_y(training_data)
X_test, y_test = extract_X_y(test_data)
We'll cross-validate within the training sample (that step is currently commented out in the loop below), then train on the full training sample and see how well the model performs on the test sample.
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
import numpy as np
# (label, pipeline) pairs, evaluated in this order.
models = [
    ('random forest ignore', pipe_rf_ignore),
    ('random forest ordinal', pipe_rf_lexico),
    ('random forest omni 20', pipe_rf_omni_20),
    ('random forest omni 50', pipe_rf_omni_50),
]

for label, model in models:
    heading = 'Evaluating {}'.format(label)
    print(heading)
    cmd_say(heading)

    # Cross-validation on the training sample is disabled; uncomment to run it.
    # with time_and_log('cross validating', say=True, prefix=" _"):
    #     scores = cross_val_score(estimator=model,
    #                              X=X_train,
    #                              y=y_train,
    #                              cv=5,
    #                              n_jobs=1)
    #     print(' CV accuracy: {:.3f} +/- {:.3f}'.format(np.mean(scores), np.std(scores)))

    with time_and_log('fitting full training set', say=True, prefix=" _"):
        model.fit(X_train, y_train)

    # Prediction happens inside the timed block, so the reported time covers
    # preprocessing the test sample as well as scoring it.
    with time_and_log('evaluating on full test set', say=True, prefix=" _"):
        test_accuracy = accuracy_score(y_test, model.predict(X_test))
        print(" Full test accuracy ({:.2f} of dataset): {:.3f}".format(test_frac, test_accuracy))
Evaluating random forest ignore _Starting fitting full training set _Finished fitting full training set in 3.86 seconds _Starting evaluating on full test set Full test accuracy (0.05 of dataset): 0.880 _Finished evaluating on full test set in 16.32 seconds Evaluating random forest ordinal _Starting fitting full training set _Finished fitting full training set in 4.26 seconds _Starting evaluating on full test set Full test accuracy (0.05 of dataset): 0.885 _Finished evaluating on full test set in 16.10 seconds Evaluating random forest omni 20 _Starting fitting full training set _Finished fitting full training set in 376.31 seconds _Starting evaluating on full test set Full test accuracy (0.05 of dataset): 0.885 _Finished evaluating on full test set in 1050.23 seconds Evaluating random forest omni 50 _Starting fitting full training set _Finished fitting full training set in 417.19 seconds _Starting evaluating on full test set Full test accuracy (0.05 of dataset): 0.886 _Finished evaluating on full test set in 1102.41 seconds