#!/usr/bin/env python
# coding: utf-8

# ## Kaggle's Predicting Red Hat Business Value
# 
# This is a follow-up attempt at Kaggle's [Predicting Red Hat Business Value](https://www.kaggle.com/c/predicting-red-hat-business-value) competition.
# 
# See [my notebooks section](http://karlrosaen.com/ml/notebooks) for links to the first attempt and other Kaggle competitions.
# 
# The focus of this iteration is exploring whether we can bring back the previously ignored categorical columns that have hundreds if not thousands of unique values, which make one-hot encoding impractical.
# 
# Two approaches are taken to categorical variables with a large number of unique values:
# 
# - encoding the values ordinally: sorting the values lexicographically, assigning them a sequence of integers, and treating those quantitatively from there
# - one-hot encoding the most frequently occurring values and binary encoding the rest. As part of this I developed a new scikit-learn transformer
# 
# The end result: re-including the columns boosted performance on the training set by only 0.5%, and surprisingly the binary / one-hot combo did hardly any better than the ordinal encoding.
# 
# ### Loading in the data

# In[1]:

import pandas as pd

people = pd.read_csv('people.csv.zip')
people.head(3)


# In[2]:

actions = pd.read_csv('act_train.csv.zip')
actions.head(3)


# ## Joining together to get dataset

# In[3]:

training_data_full = pd.merge(actions, people, how='inner', on='people_id', suffixes=['_action', '_person'], sort=False)
training_data_full.head(5)


# In[4]:

(actions.shape, people.shape, training_data_full.shape)


# ## Building a preprocessing pipeline
# 
# Notice the new `OmniEncoder` transformer and read more about its development in [my learning log](http://karlrosaen.com/ml/learning-log/2016-08-26/).

# In[15]:

# %load "preprocessing_transforms.py"
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
import heapq
import numpy as np


class BaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, **transform_params):
        return self


class ColumnSelector(BaseTransformer):
    """Selects columns from a pandas DataFrame"""

    def __init__(self, columns, c_type=None):
        self.columns = columns
        self.c_type = c_type

    def transform(self, X, **transform_params):
        cs = X[self.columns]
        if self.c_type is None:
            return cs
        else:
            return cs.astype(self.c_type)


class OmniEncoder(BaseTransformer):
    """
    Encodes a categorical variable using no more than k columns. As many values as possible
    are one-hot encoded, the remaining are fit within a binary encoded set of columns.
    If necessary some are dropped (e.g. if (#unique_values) > 2^k).

    In deciding which values to one-hot encode, those that appear more frequently are preferred.
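
    For example, with max_cols=5 and a column containing 8 unique values, the 2 most
    frequent values each get a one-hot column of their own, and the remaining 6 values
    are binary encoded across the other 3 columns (codes 1-6, with the all-zero bit
    pattern reserved for one-hot encoded or unseen values).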
""" def __init__(self, max_cols=20): self.column_infos = {} self.max_cols = max_cols if max_cols < 3 or max_cols > 100: raise ValueError("max_cols {} not within range(3, 100)".format(max_cols)) def fit(self, X, y=None, **fit_params): self.column_infos = {col: self._column_info(X[col], self.max_cols) for col in X.columns} return self def transform(self, X, **transform_params): return pd.concat( [self._encode_column(X[col], self.max_cols, *self.column_infos[col]) for col in X.columns], axis=1 ) @staticmethod def _encode_column(col, max_cols, one_hot_vals, binary_encoded_vals): num_one_hot = len(one_hot_vals) num_bits = max_cols - num_one_hot if len(binary_encoded_vals) > 0 else 0 # http://stackoverflow.com/a/29091970/231589 zero_base = ord('0') def i_to_bit_array(i): return np.fromstring( np.binary_repr(i, width=num_bits), 'u1' ) - zero_base binary_val_to_bit_array = {val: i_to_bit_array(idx + 1) for idx, val in enumerate(binary_encoded_vals)} bit_cols = [np.binary_repr(2 ** i, width=num_bits) for i in reversed(range(num_bits))] col_names = ["{}_{}".format(col.name, val) for val in one_hot_vals] + ["{}_{}".format(col.name, bit_col) for bit_col in bit_cols] zero_bits = np.zeros(num_bits, dtype=np.int) def splat(v): v_one_hot = [1 if v == ohv else 0 for ohv in one_hot_vals] v_bits = binary_val_to_bit_array.get(v, zero_bits) return pd.Series(np.concatenate([v_one_hot, v_bits])) df = col.apply(splat) df.columns = col_names return df @staticmethod def _column_info(col, max_cols): """ :param col: pd.Series :return: {'val': 44, 'val2': 4, ...} """ val_counts = dict(col.value_counts()) num_one_hot = OmniEncoder._num_onehot(len(val_counts), max_cols) return OmniEncoder._partition_one_hot(val_counts, num_one_hot) @staticmethod def _partition_one_hot(val_counts, num_one_hot): """ Paritions the values in val counts into a list of values that should be one-hot encoded and a list of values that should be binary encoded. The `num_one_hot` most popular values are chosen to be one-hot encoded. :param val_counts: {'val': 433} :param num_one_hot: the number of elements to be one-hot encoded :return: ['val1', 'val2'], ['val55', 'val59'] """ one_hot_vals = [k for (k, count) in heapq.nlargest(num_one_hot, val_counts.items(), key=lambda t: t[1])] one_hot_vals_lookup = set(one_hot_vals) bin_encoded_vals = [val for val in val_counts if val not in one_hot_vals_lookup] return sorted(one_hot_vals), sorted(bin_encoded_vals) @staticmethod def _num_onehot(n, k): """ Determines the number of onehot columns we can have to encode n values in no more than k columns, assuming we will binary encode the rest. :param n: The number of unique values to encode :param k: The maximum number of columns we have :return: The number of one-hot columns to use """ num_one_hot = min(n, k) def num_bin_vals(num): if num == 0: return 0 return 2 ** num - 1 def capacity(oh): """ Capacity given we are using `oh` one hot columns. 
""" return oh + num_bin_vals(k - oh) while capacity(num_one_hot) < n and num_one_hot > 0: num_one_hot -= 1 return num_one_hot class EncodeCategorical(BaseTransformer): def __init__(self): self.categorical_vals = {} def fit(self, X, y=None, **fit_params): self.categorical_vals = {col: {label: idx + 1 for idx, label in enumerate(sorted(X[col].dropna().unique()))} for col in X.columns} return self def transform(self, X, **transform_params): return pd.concat( [X[col].map(self.categorical_vals[col]) for col in X.columns], axis=1 ) class SpreadBinary(BaseTransformer): def transform(self, X, **transform_params): return X.applymap(lambda x: 1 if x == 1 else -1) class DfTransformerAdapter(BaseTransformer): """Adapts a scikit-learn Transformer to return a pandas DataFrame""" def __init__(self, transformer): self.transformer = transformer def fit(self, X, y=None, **fit_params): self.transformer.fit(X, y=y, **fit_params) return self def transform(self, X, **transform_params): raw_result = self.transformer.transform(X, **transform_params) return pd.DataFrame(raw_result, columns=X.columns, index=X.index) class DfOneHot(BaseTransformer): """ Wraps helper method `get_dummies` making sure all columns get one-hot encoded. """ def __init__(self): self.dummy_columns = [] def fit(self, X, y=None, **fit_params): self.dummy_columns = pd.get_dummies( X, prefix=[c for c in X.columns], columns=X.columns).columns return self def transform(self, X, **transform_params): return pd.get_dummies( X, prefix=[c for c in X.columns], columns=X.columns).reindex(columns=self.dummy_columns, fill_value=0) class DfFeatureUnion(BaseTransformer): """A dataframe friendly implementation of `FeatureUnion`""" def __init__(self, transformers): self.transformers = transformers def fit(self, X, y=None, **fit_params): for l, t in self.transformers: t.fit(X, y=y, **fit_params) return self def transform(self, X, **transform_params): transform_results = [t.transform(X, **transform_params) for l, t in self.transformers] return pd.concat(transform_results, axis=1) # In[7]: for col in training_data_full.columns: print("in {} there are {} unique values".format(col, len(training_data_full[col].unique()))) None # ### Potential trouble with high dimensionality # # Notice that char_10_action, group_1 and others have a ton of unique values; one-hot encoding will result in a dataframe with thousands of columns. 
# 
# Let's explore three approaches to dealing with categorical columns with many unique values and compare performance:
# 
# - ignore them
# - encode them ordinally, mapping every unique value to a different integer (assuming an ordering that probably doesn't exist, at least not under our default lexicographical sorting)
# - encode them with a combination of one-hot and binary encoding

# In[16]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler

cat_columns = ['activity_category',
               'char_1_action', 'char_2_action', 'char_3_action', 'char_4_action',
               'char_5_action', 'char_6_action', 'char_7_action', 'char_8_action',
               'char_9_action', 'char_1_person', 'char_2_person', 'char_3_person',
               'char_4_person', 'char_5_person', 'char_6_person', 'char_7_person',
               'char_8_person', 'char_9_person', 'char_10_person', 'char_11',
               'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17',
               'char_18', 'char_19', 'char_20', 'char_21', 'char_22', 'char_23',
               'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29',
               'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35',
               'char_36', 'char_37']

high_dim_cat_columns = ['date_action', 'char_10_action', 'group_1', 'date_person']

q_columns = ['char_38']

preprocessor_ignore = Pipeline([
    ('features', DfFeatureUnion([
        ('quantitative', Pipeline([
            ('select-quantitative', ColumnSelector(q_columns, c_type='float')),
            ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
            ('scale', DfTransformerAdapter(StandardScaler()))
        ])),
        ('categorical', Pipeline([
            ('select-categorical', ColumnSelector(cat_columns)),
            ('apply-onehot', DfOneHot()),
            ('spread-binary', SpreadBinary())
        ])),
    ]))
])

preprocessor_lexico = Pipeline([
    ('features', DfFeatureUnion([
        ('quantitative', Pipeline([
            ('combine-q', DfFeatureUnion([
                ('highd', Pipeline([
                    ('select-highd', ColumnSelector(high_dim_cat_columns)),
                    ('encode-highd', EncodeCategorical())
                ])),
                ('select-quantitative', ColumnSelector(q_columns, c_type='float')),
            ])),
            ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
            ('scale', DfTransformerAdapter(StandardScaler()))
        ])),
        ('categorical', Pipeline([
            ('select-categorical', ColumnSelector(cat_columns)),
            ('apply-onehot', DfOneHot()),
            ('spread-binary', SpreadBinary())
        ])),
    ]))
])

preprocessor_omni_20 = Pipeline([
    ('features', DfFeatureUnion([
        ('quantitative', Pipeline([
            ('select-quantitative', ColumnSelector(q_columns, c_type='float')),
            ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
            ('scale', DfTransformerAdapter(StandardScaler()))
        ])),
        ('categorical', Pipeline([
            ('select-categorical', ColumnSelector(cat_columns + high_dim_cat_columns)),
            ('apply-onehot', OmniEncoder(max_cols=20)),
            ('spread-binary', SpreadBinary())
        ])),
    ]))
])

preprocessor_omni_50 = Pipeline([
    ('features', DfFeatureUnion([
        ('quantitative', Pipeline([
            ('select-quantitative', ColumnSelector(q_columns, c_type='float')),
            ('impute-missing', DfTransformerAdapter(Imputer(strategy='median'))),
            ('scale', DfTransformerAdapter(StandardScaler()))
        ])),
        ('categorical', Pipeline([
            ('select-categorical', ColumnSelector(cat_columns + high_dim_cat_columns)),
            ('apply-onehot', OmniEncoder(max_cols=50)),
            ('spread-binary', SpreadBinary())
        ])),
    ]))
])


# ### Sampling to reduce runtime when training on the large dataset
# 
# If we train models on the entire training dataset provided, it exhausts the memory on my laptop. Again, in the spirit of getting something quick and dirty working, we'll sample the dataset and train on that.
# We'll then evaluate our models by testing the accuracy on a larger sample.

# In[17]:

from sklearn.cross_validation import train_test_split

training_frac = 0.01
test_frac = 0.05

training_data, the_rest = train_test_split(training_data_full, train_size=training_frac, random_state=0)

test_data = the_rest.sample(frac=test_frac / (1 - training_frac))


# In[18]:

training_data.shape


# In[19]:

test_data.shape


# ### Reporting utilities
# 
# Some utilities to make reporting progress easier.

# In[21]:

import time
import subprocess


class time_and_log():

    def __init__(self, label, *, prefix='', say=False):
        self.label = label
        self.prefix = prefix
        self.say = say

    def __enter__(self):
        msg = 'Starting {}'.format(self.label)
        print('{}{}'.format(self.prefix, msg))
        if self.say:
            cmd_say(msg)
        self.start = time.process_time()
        return self

    def __exit__(self, *exc):
        self.interval = time.process_time() - self.start
        msg = 'Finished {} in {:.2f} seconds'.format(self.label, self.interval)
        print('{}{}'.format(self.prefix, msg))
        if self.say:
            cmd_say(msg)
        return False


def cmd_say(msg):
    subprocess.call("say '{}'".format(msg), shell=True)


# In[22]:

with time_and_log('wrangling training data', say=True, prefix=" _"):
    wrangled = preprocessor_omni_20.fit_transform(training_data)


# In[23]:

wrangled.head()


# ## Putting together classifiers

# In[24]:

from sklearn.ensemble import RandomForestClassifier

pipe_rf_ignore = Pipeline([
    ('wrangle', preprocessor_ignore),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
])

pipe_rf_lexico = Pipeline([
    ('wrangle', preprocessor_lexico),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
])

pipe_rf_omni_20 = Pipeline([
    ('wrangle', preprocessor_omni_20),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
])

pipe_rf_omni_50 = Pipeline([
    ('wrangle', preprocessor_omni_50),
    ('rf', RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0))
])


# In[25]:

feature_columns = cat_columns + q_columns + high_dim_cat_columns


# In[26]:

def extract_X_y(df):
    return df[feature_columns], df['outcome']

X_train, y_train = extract_X_y(training_data)
X_test, y_test = extract_X_y(test_data)


# ### Cross validation and full test set accuracy
# 
# We'll cross validate within the training set, and then train on the full training set and see how well it performs on the full test set.

# In[27]:

from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
import numpy as np

models = [
    ('random forest ignore', pipe_rf_ignore),
    ('random forest ordinal', pipe_rf_lexico),
    ('random forest omni 20', pipe_rf_omni_20),
    ('random forest omni 50', pipe_rf_omni_50),
]

for label, model in models:
    print('Evaluating {}'.format(label))
    cmd_say('Evaluating {}'.format(label))
#     with time_and_log('cross validating', say=True, prefix=" _"):
#         scores = cross_val_score(estimator=model,
#                                  X=X_train,
#                                  y=y_train,
#                                  cv=5,
#                                  n_jobs=1)
#         print(' CV accuracy: {:.3f} +/- {:.3f}'.format(np.mean(scores), np.std(scores)))
    with time_and_log('fitting full training set', say=True, prefix=" _"):
        model.fit(X_train, y_train)
    with time_and_log('evaluating on full test set', say=True, prefix=" _"):
        print(" Full test accuracy ({:.2f} of dataset): {:.3f}".format(
            test_frac,
            accuracy_score(y_test, model.predict(X_test))))
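
# As a rough way to see how aggressively each strategy compresses the feature space, we could
# also compare the number of columns each preprocessor produces on the sampled training data
# (a quick sketch; it re-fits each preprocessor from scratch, so it takes a while):

# In[ ]:

for label, preprocessor in [('ignore', preprocessor_ignore),
                            ('ordinal', preprocessor_lexico),
                            ('omni 20', preprocessor_omni_20),
                            ('omni 50', preprocessor_omni_50)]:
    with time_and_log('wrangling with {} preprocessor'.format(label)):
        n_cols = preprocessor.fit_transform(training_data).shape[1]
    print("  {}: {} columns".format(label, n_cols))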