Dr. Michael Allgöwer, b.telligent, michael.allgoewer@btelligent.com
try:
    %tensorflow_version 2.x  # Colab magic selecting the TensorFlow 2.x runtime
except Exception:
    pass
TensorFlow 2.x selected.
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from pathlib import Path
from collections import OrderedDict
print(tf.__version__, tf.keras.__version__)
2.0.0-rc2 2.2.4-tf
# Importing data, keeping it all together in a class that can return either a pandas DataFrame or a tf.data Dataset.
# tf.data Datasets are the TensorFlow 2.0-native way of handling data.
class Flights():
    '''Flight delay classification data from Szilard Pafka's benchm-ml; derived from the well-known flights dataset'''
def __init__(self):
# you may want to change these paths, depending on where you put the files
train_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/train-0.01m.csv'
test_path = 'https://raw.githubusercontent.com/Allgoerithm/neuralaveraging/master/data/test.csv'
paths = {'train': train_path, 'test': test_path}
slices = list(paths.keys())
random_seed = 4711
        self.data = {}  # neural-network version of the data, with integer indices for categorical data
        self.data_1h = {}  # one-hot-encoded version of the data, needed for gradient-boosted trees
for (data_slice, input_path) in paths.items():
self.data[data_slice] = pd.read_csv(input_path, delimiter=',', quotechar='"', na_values=' ')
self.data[data_slice]['slice'] = data_slice # add new column with the slice the data belongs to
data_complete = self.data['train'].append(self.data['test'])
data_complete.rename(index=str, columns={'dep_delayed_15min': 'target'}, inplace=True)
# change binary target variable from Y/N to 0/1 (new datatype: int)
all_replacements = {'target': {'Y': 1, 'N': 0}}
data_complete.replace(all_replacements, inplace=True)
data_complete_1h = data_complete
        # indexing all categorical columns (transforming them into successive integers)
self.categorial_columns = [list(data_complete.columns)[i]
for i in range(len(data_complete.columns))
if list(data_complete.dtypes)[i] == np.dtype('object')]
self.categorial_columns.remove('slice')
self.index_lengths = OrderedDict()
for column in self.categorial_columns:
data_complete['catindex_' + column] = -1 + data_complete[column]\
.rank(method='dense', numeric_only=False)
data_complete = data_complete.drop(columns=[column])
            self.index_lengths['catindex_' + column] = int(1 + data_complete['catindex_' + column].max())
        # one-hot encoding for the one-hot version of the data
categorial_column_prefixes = ['onehot_' + name for name in self.categorial_columns]
data_complete_1h = pd.get_dummies(data_complete_1h, columns=self.categorial_columns,
prefix=categorial_column_prefixes, drop_first=True)
for (data_slice, input_path) in paths.items():
self.data[data_slice] = data_complete[data_complete['slice'] == data_slice].drop(columns=['slice'])
self.data_1h[data_slice] = data_complete_1h[data_complete_1h['slice'] == data_slice].drop(columns=['slice'])
        # standardize all columns except categorical columns and the target variable
self.categorial_columns = ['catindex_' + col for col in self.categorial_columns]
columns_to_standardize = [col for col in self.data['train'].columns
if col not in self.categorial_columns + ['target']]
for feature_name in columns_to_standardize:
            # mean and std of the noncategorical columns are equal for data and data_1h
            mean = self.data['train'][feature_name].mean()
            std = self.data['train'][feature_name].std()
            if std > 0:  # keep only columns with at least some variance
for data_slice in slices:
self.data[data_slice][feature_name] = (self.data[data_slice][feature_name] - mean) / std
self.data_1h[data_slice][feature_name] = (self.data_1h[data_slice][feature_name] - mean) / std
else: # drop constant columns
for data_slice in slices:
self.data[data_slice] = self.data[data_slice].drop(feature_name, axis=1)
self.data_1h[data_slice] = self.data_1h[data_slice].drop(feature_name, axis=1)
def get_dataframe(self, data_slice: str, categorials_as_onehot: bool = False):
assert data_slice in ('train', 'test', 'valid')
result = self.data_1h[data_slice] if categorials_as_onehot else self.data[data_slice]
return result
def get_dataset(self, data_slice: str):
assert data_slice in ('train', 'test', 'valid')
target = self.data[data_slice]['target']
predictor_cols = [c for c in self.data[data_slice].columns if c != 'target']
predictors = self.data[data_slice][predictor_cols]
dataset = tf.data.Dataset.from_tensor_slices((predictors.values, target.values))
return dataset
def get_index_lengths(self):
return self.index_lengths
def no_of_predictors(self):
return len(self.data['train'].columns) - 1
# instantiate our new class and get test and training data
flights = Flights()
data_train_df = flights.get_dataframe(data_slice='train')
data_test_df = flights.get_dataframe(data_slice='test')
# now for XGBoost, with one-hot encoded categorical variables
data_train_1h_df = flights.get_dataframe(data_slice='train', categorials_as_onehot=True)
data_test_1h_df = flights.get_dataframe(data_slice='test', categorials_as_onehot=True)
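For completeness, here is the tf.data route the class also offers; it is not used further below, and the batch size and shuffle buffer are illustrative assumptions, not part of the original workflow.
dataset_train = flights.get_dataset(data_slice='train').shuffle(buffer_size=1000).batch(32)
for features, labels in dataset_train.take(1):
    print(features.shape, labels.shape)  # one batch of predictors and the matching targets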
# fit a gradient-boosting model to the data as a baseline, to check we can reproduce Szilard Pafka's findings
import xgboost as xgb
import numpy as np
import sklearn.metrics
d_train = xgb.DMatrix(data_train_1h_df.drop(columns=['target']), label=data_train_1h_df['target'])
d_test = xgb.DMatrix(data_test_1h_df.drop(columns=['target']), label=data_test_1h_df['target'])
param = {'objective': 'binary:logistic', 'max_depth': 16, 'eta': 0.01, 'subsample': 0.5, 'min_child_weight': 1}
gb_model = xgb.train(params=param, dtrain=d_train, num_boost_round=1000)
gb_pred_test = gb_model.predict(d_test)
gb_auc = sklearn.metrics.roc_auc_score(data_test_df['target'], gb_pred_test)
gb_mae = sklearn.metrics.mean_absolute_error(data_test_df['target'], gb_pred_test)
print(f'AUC:{gb_auc}, MAE:{gb_mae}')
import time
from pathlib import Path
root_logdir = Path('logs')
def get_log_dir() -> Path:
run_id = Path(time.strftime('run_%Y_%m_%d-%H_%M_%S'))
return root_logdir / run_id
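get_log_dir is defined here but never wired into the training calls below; a minimal sketch of how it could be used (the callback setup is an assumption, not part of the original run):
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=str(get_log_dir()))
# then pass callbacks=[tensorboard_callback] to model.fit() to log training curves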
import sklearn.metrics # to compute AUC
import datetime
from functools import partial # higher-order-function for currying
# We use the functional API here, which is almost as simple to use as a Sequential model
# and versatile enough for our needs. To keep things tidy, we place the model definition inside a function.
# If things get more complicated (especially dynamic nets), the subclassing API becomes necessary.
def averaging_ensemble(inputs_numeric: int, inputs_for_embedding: int, embedding_input_dims: list,
                       embedding_output_dims: list, width: int, weak_learners: int,
                       activation_name: str = 'tanh', share_embedding_layer: bool = False,
                       sigmoid_layer: bool = True, averaging_layer: bool = True):
    r'''Return the input and output layers of a generic dense averaging-ensemble network.
    inputs_numeric: number of numeric columns (features) in the input data set; these are expected to be the
        first columns
    inputs_for_embedding: number of integer columns of the input set to be transformed by embeddings
    embedding_input_dims: input dimension (size of the vocabulary) for each column to be transformed by an embedding;
        this is supposed to be a list of length inputs_for_embedding
    embedding_output_dims: output dimension for each column to be transformed by an embedding;
        this is supposed to be a list of length inputs_for_embedding
    width: number of neurons in the hidden layer of each weak learner
    weak_learners: number of weak learners in the ensemble
    activation_name: string choosing the activation function for the hidden layers,
        'tanh' for tanh activation,
        'relu' for ReLU activation,
        'selu' for SELU activation
    share_embedding_layer: if True, all weak learners share a single set of embedding layers instead of
        each training its own
    sigmoid_layer: switches the sigmoid layer on and off as the last layer of each weak learner. The layer is
        usually needed; it is only switched off for checking the hidden layer size
    averaging_layer: switches the final averaging layer on and off
    '''
assert width >= 1, 'width is required to be at least 1'
assert weak_learners >= 1, 'weak_learners is required to be at least 1'
assert activation_name.lower() in ['tanh', 'relu', 'selu'], \
        f'Unknown value "{activation_name}" for activation_name. Options are "tanh", "relu" and "selu".'
assert len(embedding_input_dims) == inputs_for_embedding, \
'length of list embedding_input_dims is supposed to be equal to inputs_for_embedding'
assert len(embedding_output_dims) == inputs_for_embedding, \
'length of list embedding_output_dims is supposed to be equal to inputs_for_embedding'
if activation_name.lower() == 'tanh':
activation = tf.keras.activations.tanh
kernel_initializer = tf.initializers.GlorotUniform()
elif activation_name.lower() == 'relu':
activation = tf.keras.activations.relu
kernel_initializer = tf.initializers.GlorotUniform()
else:
activation = tf.keras.activations.selu
kernel_initializer = tf.initializers.VarianceScaling(scale=1.0, mode='fan_in')
input_layer = tf.keras.Input(shape=(inputs_numeric + inputs_for_embedding,))
split_input_layer = tf.split(input_layer, [inputs_numeric] + [1]*inputs_for_embedding, axis=1)
    hidden = []
    name_hidden = 'hidden' if weak_learners == 1 else None
    # add the hidden layer as a list of weak learners
    for i in range(weak_learners):
        if i == 0 or not share_embedding_layer:
            embedded_input_components = [split_input_layer[0]]  # use numerical inputs without transformation
            # embed the other components
            for j in range(inputs_for_embedding):
                prefix = '' if share_embedding_layer else f'wl_{i}_'
                embedding_layer_name = prefix + f'emb_{j}_in{embedding_input_dims[j]}_out{embedding_output_dims[j]}'
                embedded_input = tf.keras.layers.Embedding(input_dim=embedding_input_dims[j],
                                                           output_dim=embedding_output_dims[j], input_length=1,
                                                           name=embedding_layer_name)(split_input_layer[1 + j])
                embedded_input_components.append(tf.keras.layers.Flatten()(embedded_input))
            embedded_input = tf.keras.layers.Concatenate(axis=1)(embedded_input_components)
        # create a flat dense layer and (optionally) a sigmoid layer for classification
        weak_learner = tf.keras.layers.Dense(units=width, activation=activation, kernel_initializer=kernel_initializer,
                                             name=name_hidden)(embedded_input)
        if sigmoid_layer:
            weak_learner = tf.keras.layers.Dense(units=1, activation=tf.keras.activations.sigmoid)(weak_learner)
        hidden.append(weak_learner)
if weak_learners > 1 and averaging_layer:
output_layer = tf.keras.layers.Average()(hidden) # add an averaging layer at the end
elif weak_learners > 1: # if we have multiple outputs and no averaging layer, we return them all
output_layer = hidden
else:
output_layer = weak_learner # if there's only one weak learner, we use it as output directly
return (input_layer, output_layer)
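A quick smoke test of the builder with toy dimensions (all numbers here are illustrative, not tuned for anything):
(demo_inputs, demo_outputs) = averaging_ensemble(inputs_numeric=2, inputs_for_embedding=1, embedding_input_dims=[5],
                                                 embedding_output_dims=[2], width=4, weak_learners=3)
demo_model = tf.keras.Model(inputs=demo_inputs, outputs=demo_outputs)
demo_model.summary()  # three parallel weak learners feeding a single Average layer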
# helper function used below: finds the most strongly correlated pair of hidden-layer neurons
def compute_max_correlation(mat: np.ndarray):
    '''Compute the pairwise Spearman correlations of the columns of mat and return the row (pair of columns)
    with the maximum absolute correlation.
    '''
    corrmatrix_raw = pd.DataFrame(data=mat,
                                  columns=[f'n_{i:02}' for i in range(mat.shape[1])])\
                       .corr(method='spearman').abs()  # we discard the sign of the correlations
    corrmatrix = corrmatrix_raw.stack().reset_index()
    corrmatrix.rename(index=str, columns={'level_0': 'variable_1', 'level_1': 'variable_2', 0: 'correlation'},
                      inplace=True)  # set meaningful variable names
    correlations = corrmatrix[corrmatrix['variable_1'] > corrmatrix['variable_2']]  # keep only upper-triangular entries
    correlations = correlations.reset_index()
    return correlations.iloc[correlations['correlation'].idxmax()]  # return the row with the maximum correlation
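A toy check of the helper (purely illustrative data): two of the three columns below are monotonically related, so the returned pair should be exactly those two, with a Spearman correlation of 1.0.
toy_matrix = np.column_stack([np.arange(20), np.arange(20) ** 2, np.random.randn(20)])
print(compute_max_correlation(toy_matrix))  # expect the pair n_01/n_00 with correlation 1.0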
output_dimensions = OrderedDict([('catindex_Month', 2), ('catindex_DayofMonth', 2), ('catindex_DayOfWeek', 2),
('catindex_UniqueCarrier', 5), ('catindex_Origin', 5), ('catindex_Dest', 5)])
input_dims = OrderedDict([('catindex_Month', 13),
('catindex_DayofMonth', 32),
('catindex_DayOfWeek', 8),
('catindex_UniqueCarrier', 23),
('catindex_Origin', 305),
('catindex_Dest', 305)])
# Check the size of the hidden layer: Train a model with a single weak learner
import datetime
tf.random.set_seed(3141592653) # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level
np.random.seed(seed=3141592653) # ...and do the same for numpy's random numbers
for x in range(10):
    now = datetime.datetime.now()
    log_dir = get_log_dir()
    model_name = 'benchml100k_Layersize_check'
    activation_name = 'tanh'
    weak_learners = 1  # when we check the size, we do not use averaging
# We go for a low batch size (slow, but less prone to overfitting).
# The learning rate has been chosen by some quick trials (going down from 1 by dividing by 10 in each step until
# learning is sufficiently stable).
# We combine that with a low number of epochs as we only need a rough estimation to gauge the correlations.
learning_rate = 0.1
batch_size = 10
epochs = 20
widths = []
max_correlations = []
validation_data = (data_test_df[[c for c in data_test_df.columns if c != 'target']].values,
data_test_df['target'].values)
for width in (10, 20, 30, 40, 50, 60, 80, 100, 120, 140):
(inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()),
inputs_for_embedding=len(flights.get_index_lengths()),
embedding_input_dims=list(input_dims.values()), # list(flights.get_index_lengths().values()),
embedding_output_dims=list(output_dimensions.values()),
width=width, weak_learners=weak_learners, activation_name=activation_name)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy',
metrics=['mae', 'AUC'])
model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values,
epochs=epochs, batch_size=batch_size, verbose=0)
        # shave the model, i.e., delete the last layer to expose the hidden-layer activations
        layer_name = 'hidden'
        shaved_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
        hidden_layer_output = shaved_model.predict(validation_data[0])  # predict takes only the features
        max_correlation = compute_max_correlation(hidden_layer_output)
widths.append(width)
max_correlations.append(max_correlation)
print(f"{widths[-1]}: {max_correlation['correlation']}")
if max_correlation['correlation'] >= 0.98:
break
10: 0.8881912545780639
20: 0.9451014211983376
30: 0.9685687572680682
40: 0.947551540745585
50: 0.9645273599570406
60: 0.9763770528780563
80: 0.9572017647929024
100: 0.9846774011987006
10: 0.9419475646449638
20: 0.958465299219847
30: 0.9711734558646341
40: 0.9801705036474484
10: 0.7366949374359877
20: 0.8926622584310067
30: 0.9508609708677088
40: 0.9760320661499896
50: 0.9605671215074669
60: 0.9374681472720487
80: 0.9806870175219736
10: 0.8780105916016454
20: 0.9002817446860963
30: 0.9599076665318469
40: 0.9701751239117474
50: 0.9358039686850287
60: 0.93732918693722
80: 0.9835738119950854
10: 0.7920294754857148
(KeyboardInterrupt traceback omitted: the width-checking loop was interrupted manually during model.fit in its fifth repetition.)
# Now train the model, with the weak-learner size we just determined: from the output of the last cell we estimate
# that a width of 80 will probably suffice.
import datetime
now = datetime.datetime.now()
tf.random.set_seed(3141592653) # set a fixed (arbitrary) seed for TensorFlow's random numbers, global level
np.random.seed(seed=3141592653) # ...and do the same for numpy's random numbers
model_name = 'benchml10k'
activation_name = 'tanh'
weak_learners = 100 # 100 is good for a final model
# We go for a low batch size (slow, but less prone to overfitting).
# The learning rate has been chosen by some quick trials (going down from 1 by dividing by 10 in each step until
# learning is sufficiently stable).
# We combine that with a low number of epochs.
learning_rate = 1
batch_size = 10
epochs = 5
width = 80
(inputs, outputs) = averaging_ensemble(inputs_numeric=flights.no_of_predictors() - len(flights.get_index_lengths()),
inputs_for_embedding=len(flights.get_index_lengths()),
embedding_input_dims=list(input_dims.values()), # list(flights.get_index_lengths().values()),
embedding_output_dims=list(output_dimensions.values()),
width=width, weak_learners=weak_learners, activation_name=activation_name,
share_embedding_layer=True)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
validation_data = (data_test_df.drop(columns=['target']).values,
data_test_df['target'].values)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss='binary_crossentropy',
metrics=['mae', 'AUC'])
model.fit(data_train_df[[c for c in data_train_df.columns if c != 'target']].values, data_train_df['target'].values,
epochs=epochs, batch_size=batch_size, validation_data=validation_data)
Train on 10000 samples, validate on 100000 samples
Epoch 1/5
10000/10000 [==============================] - 173s 17ms/sample - loss: 0.4777 - mae: 0.3229 - AUC: 0.6430 - val_loss: 0.4889 - val_mae: 0.3036 - val_AUC: 0.6842
Epoch 2/5
10000/10000 [==============================] - 168s 17ms/sample - loss: 0.4518 - mae: 0.2883 - AUC: 0.6952 - val_loss: 0.4874 - val_mae: 0.2984 - val_AUC: 0.6884
Epoch 3/5
10000/10000 [==============================] - 167s 17ms/sample - loss: 0.4470 - mae: 0.2835 - AUC: 0.7048 - val_loss: 0.4836 - val_mae: 0.3005 - val_AUC: 0.6931
Epoch 4/5
10000/10000 [==============================] - 166s 17ms/sample - loss: 0.4436 - mae: 0.2826 - AUC: 0.7113 - val_loss: 0.4829 - val_mae: 0.2977 - val_AUC: 0.6959
Epoch 5/5
10000/10000 [==============================] - 157s 16ms/sample - loss: 0.4405 - mae: 0.2788 - AUC: 0.7176 - val_loss: 0.4836 - val_mae: 0.3009 - val_AUC: 0.6927
<tensorflow.python.keras.callbacks.History at 0x7f96bc9ce2b0>
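To put these numbers next to the XGBoost baseline, the test metrics can also be read off the trained ensemble directly; this evaluation step is an addition, not part of the original run, and the batch size is arbitrary.
test_loss, test_mae, test_auc = model.evaluate(validation_data[0], validation_data[1], batch_size=1000, verbose=0)
print(f'neural averaging ensemble - AUC: {test_auc}, MAE: {test_mae}')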