#hide
%%bash
pip install -q tf-nightly

#hide
import pandas as pd
import tensorflow as tf
# Show which TensorFlow build is active (the notebook installs tf-nightly above)
print('TF version: ', tf.__version__)

# Toy feature table: one string categorical column, two integer categorical
# columns and two numerical columns, four rows each.
xdf = pd.DataFrame(
    data={
        'categorical_string': ['LOW', 'HIGH', 'HIGH', 'MEDIUM'],
        'categorical_integer_1': [1, 0, 1, 0],
        'categorical_integer_2': [1, 2, 3, 4],
        'numerical_1': [2.3, 0.2, 1.9, 5.8],
        'numerical_2': [16, 32, 8, 60],
    }
)

# Matching single-column label table
ydf = pd.Series([0, 0, 0, 1], name='target').to_frame()

# Slice row-wise into (features dict, label) pairs
ds = tf.data.Dataset.from_tensor_slices((dict(xdf), ydf))

# Peek at the first (features, label) element
x, y = next(iter(ds.take(1)))
print('X:', x)
print('y:', y)

from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

def create_numerical_encoder(dataset, name):
    """Return a ``Normalization`` layer adapted to one feature of ``dataset``.

    Args:
        dataset: Dataset yielding ``(features, label)`` pairs, where
            ``features`` can be indexed by ``name``.
        name: Key of the feature whose statistics should be learned.

    Returns:
        The ``Normalization`` layer after ``adapt`` has been run on the
        selected feature.
    """
    def _feature_column(features, _label):
        # Discard the label and give the selected feature a trailing axis
        return tf.expand_dims(features[name], -1)

    scaler = Normalization()

    # Learn the statistics from a dataset that only yields our feature
    scaler.adapt(dataset.map(_feature_column))
    return scaler

# Apply normalization to a numerical feature.
# The column is selected by its literal key: `name` only exists as a parameter
# inside create_numerical_encoder, so using it here raised a NameError.
normalizer = create_numerical_encoder(ds, 'numerical_1')
# Call the layer directly; `Layer.apply` is only a deprecated alias of `__call__`.
normalizer(xdf['numerical_1'].values)

def create_integer_categorical_encoder(dataset, name):
    """Return a binary-mode ``CategoryEncoding`` layer adapted to one feature.

    Args:
        dataset: Dataset yielding ``(features, label)`` pairs, where
            ``features`` can be indexed by ``name``.
        name: Key of the integer-valued categorical feature.

    Returns:
        The ``CategoryEncoding`` layer after ``adapt`` has been run on the
        selected feature.
    """
    def _feature_column(features, _label):
        # Discard the label and give the selected feature a trailing axis
        return tf.expand_dims(features[name], -1)

    one_hot = CategoryEncoding(output_mode="binary")

    # Learn the space of possible indices from our feature only
    one_hot.adapt(dataset.map(_feature_column))
    return one_hot

# Apply the adapted encoder to the first integer categorical feature.
# The layer is called directly; `Layer.apply` is only a deprecated alias of `__call__`.
encoder1 = create_integer_categorical_encoder(ds, 'categorical_integer_1')
encoder1(xdf['categorical_integer_1'].values)

# Apply the adapted encoder to the second integer categorical feature
encoder2 = create_integer_categorical_encoder(ds, 'categorical_integer_2')
encoder2(xdf['categorical_integer_2'].values)

def create_string_categorical_encoder(dataset, name):
    """Return a ``(StringLookup, CategoryEncoding)`` pair adapted to one feature.

    The lookup layer is adapted on the raw string feature; the encoding layer
    is then adapted on the indices that the lookup layer produces for it.

    Args:
        dataset: Dataset yielding ``(features, label)`` pairs, where
            ``features`` can be indexed by ``name``.
        name: Key of the string-valued categorical feature.

    Returns:
        A tuple ``(index, encoder)``: the adapted ``StringLookup`` layer and
        the adapted binary-mode ``CategoryEncoding`` layer.
    """
    def _feature_column(features, _label):
        # Discard the label and give the selected feature a trailing axis
        return tf.expand_dims(features[name], -1)

    strings_ds = dataset.map(_feature_column)

    # Learn the set of possible string values and give each a fixed integer index
    lookup = StringLookup()
    lookup.adapt(strings_ds)

    # Learn the space of possible indices from the looked-up strings
    one_hot = CategoryEncoding(output_mode="binary")
    one_hot.adapt(strings_ds.map(lookup))

    return lookup, one_hot

# Apply one-hot encoding to a string categorical feature
indexer, encoder3 = create_string_categorical_encoder(ds, 'categorical_string')
# Turn the string input into integer indices.
# Layers are called directly; `Layer.apply` is only a deprecated alias of `__call__`.
indices = indexer(xdf['categorical_string'].values)
# Apply one-hot encoding to our indices
encoder3(indices)

# Inspect the vocabulary learned by the lookup layer
indexer.get_vocabulary()