Open In Colab

Neural averaging ensembles on benchm-ml data

Dr. Michael Allgöwer, b.telligent, [email protected]

In [0]:
try:
  %tensorflow_version 2.x
except Exception:
  pass
TensorFlow 2.x selected.
In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from pathlib import Path
from collections import OrderedDict
In [0]:
print(tf.__version__, tf.keras.__version__)
2.0.0-rc2 2.2.4-tf
In [0]:
# Import the data, keeping everything together in a class that can return either a pandas DataFrame or a tf.data.Dataset.
# tf.data datasets are the TensorFlow 2.0-native way of handling data.
class Flights():
    '''Flight delay classification data from Szilard Pafka's benchm-ml; derived from the well-known flights dataset'''

    def __init__(self):
        
        # you may want to change these paths, depending on where you put the files
        train_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/train-0.01m.csv'  
        test_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/test.csv'
        paths = {'train': train_path, 'test': test_path}
        slices = list(paths.keys())

        random_seed = 4711
        self.data = {}  # neural-network version of the data, with integer indices for categorical data
        self.data_1h = {}  # one-hot-encoded version of the data, needed for gradient boosted trees

        for (data_slice, input_path) in paths.items():
            self.data[data_slice] = pd.read_csv(input_path, delimiter=',', quotechar='"', na_values=' ')
            self.data[data_slice]['slice'] = data_slice  # add new column with the slice the data belongs to
        data_complete = self.data['train'].append(self.data['test'])
        data_complete.rename(index=str, columns={'dep_delayed_15min': 'target'}, inplace=True)            

        # change binary target variable from Y/N to 0/1 (new datatype: int)
        all_replacements = {'target': {'Y': 1, 'N': 0}}
        data_complete.replace(all_replacements, inplace=True)
        data_complete_1h = data_complete.copy()  # copy, so the category indexing below does not leak into the one-hot version
        
        # indexing all categorical columns (transform each into successive integers)
        self.categorial_columns = [list(data_complete.columns)[i]
                            for i in range(len(data_complete.columns))
                            if list(data_complete.dtypes)[i] == np.dtype('object')]
        self.categorial_columns.remove('slice')
        self.index_lengths = OrderedDict()
        for column in self.categorial_columns:
            data_complete['catindex_' + column] = -1 + data_complete[column]\
                                                                .rank(method='dense', numeric_only=False)
            data_complete = data_complete.drop(columns=[column])
            self.index_lengths['catindex_' + column] = 1 + data_complete['catindex_' + column].max()

        # one-hot encoding for the one-hot version of the data
        categorial_column_prefixes = ['onehot_' + name for name in self.categorial_columns]
        data_complete_1h = pd.get_dummies(data_complete_1h, columns=self.categorial_columns,
                                          prefix=categorial_column_prefixes, drop_first=True)        

        for (data_slice, input_path) in paths.items():
            self.data[data_slice] = data_complete[data_complete['slice'] == data_slice].drop(columns=['slice'])
            self.data_1h[data_slice] = data_complete_1h[data_complete_1h['slice'] == data_slice].drop(columns=['slice'])

        # standardize all columns except categorical columns and the target variable
        self.categorial_columns = ['catindex_' + col for col in self.categorial_columns]
        columns_to_standardize = [col for col in self.data['train'].columns 
                                      if col not in self.categorial_columns + ['target']]
        for feature_name in columns_to_standardize:
            # mean and standard deviation of the non-categorical columns are the same for data and data_1h
            mean = self.data['train'][feature_name].mean()  
            std = self.data['train'][feature_name].std()
            if std > 0:  # keep only columns with at least some variance
                for data_slice in slices:
                    self.data[data_slice][feature_name] = (self.data[data_slice][feature_name] - mean) / std
                    self.data_1h[data_slice][feature_name] = (self.data_1h[data_slice][feature_name] - mean) / std
            else:  # drop constant columns
                for data_slice in slices:
                    self.data[data_slice] = self.data[data_slice].drop(feature_name, axis=1)
                    self.data_1h[data_slice] = self.data_1h[data_slice].drop(feature_name, axis=1)

    def get_dataframe(self, data_slice: str, categorials_as_onehot: bool = False):
        assert data_slice in ('train', 'test', 'valid')
        result = self.data_1h[data_slice] if categorials_as_onehot else self.data[data_slice]
        return result

    def get_dataset(self, data_slice: str):
        assert data_slice in ('train', 'test', 'valid')
        target = self.data[data_slice]['target']
        predictor_cols = [c for c in self.data[data_slice].columns if c != 'target']
        predictors = self.data[data_slice][predictor_cols]
        dataset = tf.data.Dataset.from_tensor_slices((predictors.values, target.values))
        return dataset
    
    def get_index_lengths(self):
        return self.index_lengths

    def no_of_predictors(self):
        return len(self.data['train'].columns) - 1
In [0]:
# instantiate our new class and get test and training data
flights = Flights()
data_train_df = flights.get_dataframe(data_slice='train')
data_test_df = flights.get_dataframe(data_slice='test')

# now the same for XGBoost, with one-hot encoded categorical variables
data_train_1h_df = flights.get_dataframe(data_slice='train', categorials_as_onehot=True)
data_test_1h_df = flights.get_dataframe(data_slice='test', categorials_as_onehot=True)
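
The Flights class can also hand the data over as a tf.data.Dataset via get_dataset. As a minimal sketch (not used in the experiments below), this is how such a dataset could be shuffled and batched before feeding it to model.fit; the shuffle buffer size and the batch size of 10 are purely illustrative.

In [0]:
# illustrative sketch only: fetch the training data as a tf.data.Dataset and prepare it for training
dataset_train = flights.get_dataset(data_slice='train')
dataset_train = dataset_train.shuffle(buffer_size=10000).batch(10)
for features, target in dataset_train.take(1):  # peek at a single batch
    print(features.shape, target.shape)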
In [0]:
# fit a gradient boosting model to the data as a baseline, to check that we can reproduce Szilard Pafka's findings

import xgboost as xgb
import numpy as np
import sklearn.metrics

d_train = xgb.DMatrix(data_train_1h_df.drop(columns=['target']), label=data_train_1h_df['target'])
d_test = xgb.DMatrix(data_test_1h_df.drop(columns=['target']), label=data_test_1h_df['target'])
# note: XGBoost has no 'min_obs_node' parameter; min_child_weight (minimum sum of instance weight in a child) is the closest equivalent
param = {'objective': 'binary:logistic', 'max_depth': 16, 'eta': 0.01, 'subsample': 0.5, 'min_child_weight': 1}

gb_model = xgb.train(params=param, dtrain=d_train, num_boost_round=1000)  

gb_pred_test = gb_model.predict(d_test)
gb_auc = sklearn.metrics.roc_auc_score(data_test_df['target'], gb_pred_test)
gb_mae = sklearn.metrics.mean_absolute_error(data_test_df['target'], gb_pred_test)
print(f'AUC:{gb_auc}, MAE:{gb_mae}')
In [0]:
import time
from pathlib import Path
root_logdir = Path('logs')

def get_log_dir() -> Path:
    run_id = Path(time.strftime('run_%Y_%m_%d-%H_%M_%S'))
    return root_logdir / run_id
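
The log-directory helper above is not wired into any of the fit calls below. As a minimal sketch, this is how it could feed a standard TensorBoard callback; the callback would then be passed to model.fit via the callbacks argument.

In [0]:
# illustrative sketch only: log a training run to TensorBoard using the helper above
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=str(get_log_dir()))
# model.fit(..., callbacks=[tensorboard_callback])  # pass the callback to one of the fit calls below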

Building a neural averaging ensemble

In [0]:
import sklearn.metrics                 # to compute AUC
import datetime

from functools import partial          # higher-order-function for currying

# We use the functional API here, which is almost as simple to use as the Sequential API
# and versatile enough for our needs. To keep things tidy, we place the model definition inside a function.
# If things get more complicated (especially dynamic nets), the subclassing API is needed.

def averaging_ensemble(inputs_numeric: int, inputs_for_embedding: int, embedding_input_dims: list,
                       embedding_output_dims: list, width: int, weak_learners: int,
                       activation_name: str = 'tanh', share_embedding_layer: bool = False,
                       sigmoid_layer: bool = True, averaging_layer: bool = True):
    r'''Return a generic dense network model

    inputs_numeric: number of numeric columns (features) in the input data set; these are expected to be the first
    columns
    inputs_for_embedding: number of integer columns of the input set to be transformed by embeddings
    embedding_input_dims: input dimension (size of the vocabulary) for each column to be transformed by an embedding;
    this is supposed to be a list of length inputs_for_embedding
    embedding_output_dims: output dimension for each column to be transformed by an embedding;
    this is supposed to be a list of length inputs_for_embedding
    width: number of neurons in the hidden layer of each weak learner
    weak_learners: number of weak learners in the ensemble
    activation_name: string choosing the activation function for the hidden layers,
                    'tanh' for tanh activation,
                    'relu' for ReLU activation,
                    'selu' for SELU activation
    share_embedding_layer: if True, all weak learners share the same embedding layers; otherwise each weak learner
    gets its own embeddings
    sigmoid_layer: switches the sigmoid output layer of each weak learner on and off. The layer is usually needed;
    it is only switched off for checking the hidden layer size
    averaging_layer: switches the final averaging layer on and off
    '''
    assert width >= 1, 'width is required to be at least 1'
    assert weak_learners >= 1, 'weak_learners is required to be at least 1'
    assert activation_name.lower() in ['tanh', 'relu', 'selu'], \
        f'Unknown value "{activation_name}" for activation_fct. Options are "tanh", "relu" and "selu".'
    assert len(embedding_input_dims) == inputs_for_embedding, \
        'length of list embedding_input_dims is supposed to be equal to inputs_for_embedding'
    assert len(embedding_output_dims) == inputs_for_embedding, \
        'length of list embedding_output_dims is supposed to be equal to inputs_for_embedding'

    if activation_name.lower() == 'tanh':
        activation = tf.keras.activations.tanh
        kernel_initializer = tf.initializers.GlorotUniform()
    elif activation_name.lower() == 'relu':
        activation = tf.keras.activations.relu
        kernel_initializer = tf.initializers.GlorotUniform()        
    else:
        activation = tf.keras.activations.selu
        kernel_initializer = tf.initializers.VarianceScaling(scale=1.0, mode='fan_in')

    input_layer = tf.keras.Input(shape=(inputs_numeric + inputs_for_embedding,))
    split_input_layer = tf.split(input_layer, [inputs_numeric] + [1]*inputs_for_embedding, axis=1)

    hidden = []
    # name the hidden layer only when there is a single weak learner, so it can be retrieved by name later on
    name_hidden = 'hidden' if weak_learners == 1 else None
    # add the hidden layer as a list of weak learners
    for i in range(weak_learners):
        if i == 0 or not share_embedding_layer:
            embedded_input_components = [split_input_layer[0]]  # use numerical inputs without transformation
            # embed the other components
            for j in range(inputs_for_embedding):
                prefix = '' if share_embedding_layer else f'wl_{i}_'
                embedding_layer_name = prefix + f'emb_{j}_in{embedding_input_dims[j]}_out{embedding_output_dims[j]}'
                embedded_input = tf.keras.layers.Embedding(input_dim=embedding_input_dims[j], 
                                                        output_dim=embedding_output_dims[j], input_length=1, 
                                                        name=embedding_layer_name)(split_input_layer[1 + j])
                embedded_input_components.append(tf.keras.layers.Flatten()(embedded_input))
            embedded_input = tf.keras.layers.Concatenate(axis=1)(embedded_input_components)

        # create the dense hidden layer and, if requested, a sigmoid layer for classification
        weak_learner = tf.keras.layers.Dense(units=width, activation=activation, kernel_initializer=kernel_initializer,
                                             name=name_hidden)(embedded_input)
        if sigmoid_layer:
            weak_learner = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.sigmoid)(weak_learner)
        hidden.append(weak_learner)

    if weak_learners > 1 and averaging_layer:  
        output_layer = tf.keras.layers.Average()(hidden)  # add an averaging layer at the end
    elif weak_learners > 1:  # if we have multiple outputs and no averaging layer, we return them all
        output_layer = hidden
    else:
        output_layer = weak_learner  # if there's only one weak learner, we use it as output directly
    
    return (input_layer, output_layer)      
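
As a quick sanity check of the builder, the following minimal sketch wires its return values into a tf.keras.Model; the dimensions (two numeric features, one embedded feature with a vocabulary of 7) are made up purely for illustration and do not correspond to the flights data.

In [0]:
# illustrative sketch only: build a tiny ensemble of three weak learners and inspect its structure
toy_inputs, toy_outputs = averaging_ensemble(inputs_numeric=2, inputs_for_embedding=1,
                                             embedding_input_dims=[7], embedding_output_dims=[2],
                                             width=4, weak_learners=3)
toy_model = tf.keras.Model(inputs=toy_inputs, outputs=toy_outputs)
toy_model.summary()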

First, determine the hidden layer size

In [0]:
# this helper function is needed below
def compute_correlation_histogram(mat: np.ndarray):
    '''Compute the pairwise Spearman correlations of the columns of mat and return the column pair
    with the maximum absolute correlation
    '''
    corrmatrix_raw = pd.DataFrame(data=mat, 
                                  columns=[f'n_{i:02}' for i in range(mat.shape[1])])\
                                 .corr(method="spearman").abs()  # we discard the sign of the correlations
    corrmatrix = corrmatrix_raw.stack().reset_index()
    corrmatrix.rename(index=str, columns={"level_0": "variable_1", "level_1": "variable_2", 0: "correlation"},
                    inplace=True)  # set meaningful variable names
    correlations = corrmatrix[corrmatrix['variable_1'] > corrmatrix['variable_2']] # keep only upper triangular entries
    correlations = correlations.reset_index()
    
    return correlations.iloc[correlations['correlation'].idxmax(axis='rows')]  # return row with maximum correlation
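
A quick illustration of what the helper returns: on columns of independent random noise, the maximum absolute pairwise Spearman correlation should be small, whereas on the hidden-layer activations below it tends to grow with the layer width. The random data here is purely illustrative.

In [0]:
# illustrative sketch only: apply the helper to independent random columns
random_activations = np.random.RandomState(0).normal(size=(1000, 5))
print(compute_correlation_histogram(random_activations))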
In [0]:
output_dimensions = OrderedDict([('catindex_Month', 2), ('catindex_DayofMonth', 2), ('catindex_DayOfWeek', 2),
                                ('catindex_UniqueCarrier', 5),  ('catindex_Origin', 5),  ('catindex_Dest', 5)])
input_dims = OrderedDict([('catindex_Month', 13),
            ('catindex_DayofMonth', 32),
            ('catindex_DayOfWeek', 8),
            ('catindex_UniqueCarrier', 23),
            ('catindex_Origin', 305),
            ('catindex_Dest', 305)])
In [0]:
# Check the size of the hidden layer: Train a model with a single weak learner
import datetime
tf.random.set_seed(3141592653)  # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level
np.random.seed(seed=3141592653)  # ...and do the same for numpy's random numbers

for x in range(10):
    now = datetime.datetime.now()

    log_dir = get_log_dir()
    model_name = 'benchml100k_Layersize_check'
    activation_name = 'tanh'
    weak_learners = 1  # when we check the size, we do not use averaging

    # We go for a low batch size (slow, but less prone to overfitting).
    # The learning rate has been chosen by some quick trials (going down from 1 by dividing by 10 in each step until
    # learning is sufficiently stable).
    # We combine that with a low number of epochs as we only need a rough estimation to gauge the correlations.
    learning_rate = 0.1
    batch_size = 10
    epochs = 20
    widths = []
    max_correlations = []
    validation_data = (data_test_df[[c for c in data_test_df.columns if c != 'target']].values, 
                    data_test_df['target'].values)

    for width in (10, 20, 30, 40, 50, 60, 80, 100, 120, 140):
        (inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()), 
                                            inputs_for_embedding=len(flights.get_index_lengths()), 
                                            embedding_input_dims=list(input_dims.values()),  # list(flights.get_index_lengths().values()), 
                                            embedding_output_dims=list(output_dimensions.values()),
                                            width=width, weak_learners=weak_learners, activation_name=activation_name)
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy', 
                    metrics=['mae', 'AUC'])
        model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values, 
                epochs=epochs, batch_size=batch_size, verbose=0)
        
        # "shave" the model, i.e., cut it off after the hidden layer to read out the hidden-layer activations
        layer_name = 'hidden'
        shaved_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
        hidden_layer_output = shaved_model.predict(validation_data[0])  # predict needs the features only
        max_correlation = compute_correlation_histogram(hidden_layer_output)
        widths.append(width)
        max_correlations.append(max_correlation)
        print(f"{widths[-1]}: {max_correlation['correlation']}")
        if max_correlation['correlation'] >= 0.98:
            break
10: 0.8881912545780639
20: 0.9451014211983376
30: 0.9685687572680682
40: 0.947551540745585
50: 0.9645273599570406
60: 0.9763770528780563
80: 0.9572017647929024
100: 0.9846774011987006
10: 0.9419475646449638
20: 0.958465299219847
30: 0.9711734558646341
40: 0.9801705036474484
10: 0.7366949374359877
20: 0.8926622584310067
30: 0.9508609708677088
40: 0.9760320661499896
50: 0.9605671215074669
60: 0.9374681472720487
80: 0.9806870175219736
10: 0.8780105916016454
20: 0.9002817446860963
30: 0.9599076665318469
40: 0.9701751239117474
50: 0.9358039686850287
60: 0.93732918693722
80: 0.9835738119950854
10: 0.7920294754857148
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
KeyboardInterrupt: (traceback truncated; the run was interrupted manually)
In [0]:
# Now train the full ensemble with the weak-learner size we just determined: from the output of the last cell
# we estimate that a width of 80 will probably suffice.
import datetime
now = datetime.datetime.now()

tf.random.set_seed(3141592653)  # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level
np.random.seed(seed=3141592653)  # ...and do the same for numpy's random numbers

model_name = 'benchml10k'
activation_name = 'tanh'
weak_learners = 100  # 100 is good for a final model

# We go for a low batch size (slow, but less prone to overfitting).
# The learning rate has been chosen by some quick trials (starting at 1 and dividing by 10 in each step until
# learning is sufficiently stable).
# We combine that with a small number of epochs.
learning_rate = 1
batch_size = 10
epochs = 5
width = 80

(inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()), 
                                            inputs_for_embedding=len(flights.get_index_lengths()), 
                                            embedding_input_dims=list(input_dims.values()),  # list(flights.get_index_lengths().values()), 
                                            embedding_output_dims=list(output_dimensions.values()),
                                            width=width, weak_learners=weak_learners, activation_name=activation_name,
                                            share_embedding_layer=True)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
validation_data = (data_test_df.drop(columns=['target']).values, 
                   data_test_df['target'].values)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy', 
            metrics=['mae', 'AUC'])
model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values, 
        epochs=epochs, batch_size=batch_size, validation_data=validation_data) 
Train on 10000 samples, validate on 100000 samples
Epoch 1/5
10000/10000 [==============================] - 173s 17ms/sample - loss: 0.4777 - mae: 0.3229 - AUC: 0.6430 - val_loss: 0.4889 - val_mae: 0.3036 - val_AUC: 0.6842
Epoch 2/5
10000/10000 [==============================] - 168s 17ms/sample - loss: 0.4518 - mae: 0.2883 - AUC: 0.6952 - val_loss: 0.4874 - val_mae: 0.2984 - val_AUC: 0.6884
Epoch 3/5
10000/10000 [==============================] - 167s 17ms/sample - loss: 0.4470 - mae: 0.2835 - AUC: 0.7048 - val_loss: 0.4836 - val_mae: 0.3005 - val_AUC: 0.6931
Epoch 4/5
10000/10000 [==============================] - 166s 17ms/sample - loss: 0.4436 - mae: 0.2826 - AUC: 0.7113 - val_loss: 0.4829 - val_mae: 0.2977 - val_AUC: 0.6959
Epoch 5/5
10000/10000 [==============================] - 157s 16ms/sample - loss: 0.4405 - mae: 0.2788 - AUC: 0.7176 - val_loss: 0.4836 - val_mae: 0.3009 - val_AUC: 0.6927
Out[0]:
<tensorflow.python.keras.callbacks.History at 0x7f96bc9ce2b0>
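
To compare the ensemble with the XGBoost baseline on the same footing, the following minimal sketch computes AUC and MAE from the test-set predictions; it assumes that model and validation_data from the training cell above are still in memory.

In [0]:
# illustrative sketch only: evaluate the trained ensemble with the same metrics as the XGBoost baseline
nn_pred_test = model.predict(validation_data[0]).flatten()
nn_auc = sklearn.metrics.roc_auc_score(validation_data[1], nn_pred_test)
nn_mae = sklearn.metrics.mean_absolute_error(validation_data[1], nn_pred_test)
print(f'AUC: {nn_auc}, MAE: {nn_mae}')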
In [0]: