In TF 2.3, Keras adds new preprocessing layers for image, text and strucured data. The following notebook explores those new layers for dealing with Structured data.
For a complete example of how to use the new preprocessing layer for Structured data check the Keras example - link.
#hide
%%bash
pip install -q tf-nightly
#hide
import pandas as pd
import tensorflow as tf
print('TF version: ', tf.__version__)
TF version: 2.4.0-dev20200802
Generate some random data for playing with and seeing what is the output of the preprocessing layers.
xdf = pd.DataFrame({
'categorical_string': ['LOW', 'HIGH', 'HIGH', 'MEDIUM'],
'categorical_integer_1': [1, 0, 1, 0],
'categorical_integer_2': [1, 2, 3, 4],
'numerical_1': [2.3, 0.2, 1.9, 5.8],
'numerical_2': [16, 32, 8, 60]
})
ydf = pd.DataFrame({'target': [0, 0, 0, 1]})
ds = tf.data.Dataset.from_tensor_slices((dict(xdf), ydf))
for x, y in ds.take(1):
print('X:', x)
print('y:', y)
X: {'categorical_string': <tf.Tensor: shape=(), dtype=string, numpy=b'cat1'>, 'categorical_integer_1': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'categorical_integer_2': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'numerical_1': <tf.Tensor: shape=(), dtype=float64, numpy=2.3>, 'numerical_2': <tf.Tensor: shape=(), dtype=int64, numpy=16>} y: tf.Tensor([0], shape=(1,), dtype=int64)
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
Preprocessing helper function to encode numercial features, e.g. 0.1, 0.2, etc.
def create_numerical_encoder(dataset, name):
# Create a Normalization layer for our feature
normalizer = Normalization()
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the statistics of the data
normalizer.adapt(feature_ds)
return normalizer
# Apply normalization to a numerical feature
normalizer = create_numerical_encoder(ds, 'numerical_1')
normalizer.apply(xdf[name].values)
<tf.Tensor: shape=(4, 1), dtype=float32, numpy= array([[-0.7615536], [-1.2528784], [-0.7615536], [-1.2528784]], dtype=float32)>
Preprocessing helper function to encode integer categorical features, e.g. 1, 2, 3
def create_integer_categorical_encoder(dataset, name):
# Create a CategoryEncoding for our integer indices
encoder = CategoryEncoding(output_mode="binary")
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the space of possible indices
encoder.adapt(feature_ds)
return encoder
# Apply one-hot encoding to an integer categorical feature
encoder1 = create_integer_categorical_encoder(ds, 'categorical_integer_1')
encoder1.apply(xdf['categorical_integer_1'].values)
<tf.Tensor: shape=(4, 2), dtype=float32, numpy= array([[0., 1.], [1., 0.], [0., 1.], [1., 0.]], dtype=float32)>
# Apply one-hot encoding to an integer categorical feature
encoder2 = create_integer_categorical_encoder(ds, 'categorical_integer_2')
encoder2.apply(xdf['categorical_integer_2'].values)
<tf.Tensor: shape=(4, 5), dtype=float32, numpy= array([[0., 1., 0., 0., 0.], [0., 0., 1., 0., 0.], [0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]], dtype=float32)>
Preprocessing helper function to encode string categorical features, e.g. LOW, HIGH, MEDIUM.
This will applying the following to the input feature:
def create_string_categorical_encoder(dataset, name):
# Create a StringLookup layer which will turn strings into integer indices
index = StringLookup()
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the set of possible string values and assign them a fixed integer index
index.adapt(feature_ds)
# Create a CategoryEncoding for our integer indices
encoder = CategoryEncoding(output_mode="binary")
# Prepare a dataset of indices
feature_ds = feature_ds.map(index)
# Learn the space of possible indices
encoder.adapt(feature_ds)
return index, encoder
# Apply one-hot encoding to an integer categorical feature
indexer, encoder3 = create_string_categorical_encoder(ds, 'categorical_string')
# Turn the string input into integer indices
indices = indexer.apply(xdf['categorical_string'].values)
# Apply one-hot encoding to our indices
encoder3.apply(indices)
<tf.Tensor: shape=(4, 5), dtype=float32, numpy= array([[0., 0., 0., 0., 1.], [0., 0., 1., 0., 0.], [0., 0., 1., 0., 0.], [0., 0., 0., 1., 0.]], dtype=float32)>
Notice that the string categorical column was hot encoded into 5 tokens whereas in the input dataframe there is only 3 unique values. This is because the indexer adds 2 more tokens. See the vocabulary:
indexer.get_vocabulary()
['', '[UNK]', 'cat2', 'cat3', 'cat1']