#hide
# Notebook shell cell (cell magic, not executable as plain Python):
# %%bash
# pip install -q tf-nightly

#hide
import pandas as pd
import tensorflow as tf

print('TF version: ', tf.__version__)

# Toy feature table: one string categorical, two integer categoricals and
# two numerical columns.
xdf = pd.DataFrame({
    'categorical_string': ['LOW', 'HIGH', 'HIGH', 'MEDIUM'],
    'categorical_integer_1': [1, 0, 1, 0],
    'categorical_integer_2': [1, 2, 3, 4],
    'numerical_1': [2.3, 0.2, 1.9, 5.8],
    'numerical_2': [16, 32, 8, 60],
})
ydf = pd.DataFrame({'target': [0, 0, 0, 1]})

# Each dataset element is a (features-dict, target) pair.
ds = tf.data.Dataset.from_tensor_slices((dict(xdf), ydf))

for x, y in ds.take(1):
    print('X:', x)
    print('y:', y)

from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup


def create_numerical_encoder(dataset, name):
    """Build a Normalization layer adapted to one feature of `dataset`.

    Args:
        dataset: A dataset yielding `(features, label)` pairs, where
            `features` can be indexed by feature name.
        name: Key of the feature to normalize.

    Returns:
        The Normalization layer after `adapt` has been run on the feature.
    """
    # Create a Normalization layer for our feature
    normalizer = Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    return normalizer


# Apply normalization to a numerical feature.
# The column is named explicitly here: `name` only exists inside the helper.
normalizer = create_numerical_encoder(ds, 'numerical_1')
normalizer(xdf['numerical_1'].values)


def create_integer_categorical_encoder(dataset, name):
    """Build a CategoryEncoding layer adapted to one integer feature.

    Args:
        dataset: A dataset yielding `(features, label)` pairs, where
            `features` can be indexed by feature name.
        name: Key of the integer categorical feature to encode.

    Returns:
        The CategoryEncoding layer (`output_mode="binary"`) after `adapt`
        has been run on the feature.
    """
    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the space of possible indices
    encoder.adapt(feature_ds)

    return encoder


# Apply one-hot encoding to an integer categorical feature
encoder1 = create_integer_categorical_encoder(ds, 'categorical_integer_1')
encoder1(xdf['categorical_integer_1'].values)

# Apply one-hot encoding to an integer categorical feature
encoder2 = create_integer_categorical_encoder(ds, 'categorical_integer_2')
encoder2(xdf['categorical_integer_2'].values)


def create_string_categorical_encoder(dataset, name):
    """Build a StringLookup + CategoryEncoding pair for one string feature.

    The StringLookup layer is adapted on the raw feature values; the
    CategoryEncoding layer is then adapted on the indices the lookup
    produces for those same values.

    Args:
        dataset: A dataset yielding `(features, label)` pairs, where
            `features` can be indexed by feature name.
        name: Key of the string categorical feature to encode.

    Returns:
        A `(index, encoder)` tuple: the adapted StringLookup layer and the
        adapted CategoryEncoding layer (`output_mode="binary"`).
    """
    # Create a StringLookup layer which will turn strings into integer indices
    index = StringLookup()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    index.adapt(feature_ds)

    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")

    # Prepare a dataset of indices
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices
    encoder.adapt(feature_ds)

    return index, encoder


# Apply one-hot encoding to a string categorical feature
indexer, encoder3 = create_string_categorical_encoder(ds, 'categorical_string')

# Turn the string input into integer indices
indices = indexer(xdf['categorical_string'].values)

# Apply one-hot encoding to our indices
encoder3(indices)

indexer.get_vocabulary()