This code is heavily derived from "Decision Tree from Scratch".

In [1]:
import numpy as np
import pandas as pd

# Machine epsilon: smallest representable float step, used below to guard log2(0).
eps = np.finfo(float).eps
# Toy training set: three categorical features and a binary target 'Eat'.
dataset = {'Taste': ['Salty', 'Spicy', 'Spicy', 'Spicy', 'Spicy', 'Sweet', 'Salty', 'Sweet', 'Spicy', 'Salty'],
           'Temperature': ['Hot', 'Hot', 'Hot', 'Cold', 'Hot', 'Cold', 'Cold', 'Hot', 'Cold', 'Hot'],
           'Texture': ['Soft', 'Soft', 'Hard', 'Hard', 'Hard', 'Soft', 'Soft', 'Soft', 'Soft', 'Hard'],
           'Eat': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes']}

# Column order matters elsewhere: df.keys()[:-1] is taken as the feature list,
# so the target 'Eat' must stay last.
df = pd.DataFrame(dataset, columns=['Taste', 'Temperature', 'Texture', 'Eat'])
In [2]:
def find_entropy(df):
    """Shannon entropy (in bits) of the binary 'Eat' target column of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Eat' column with 'Yes'/'No' labels.

    Returns
    -------
    float
        Entropy in [0, 1]; 0.0 for a pure (single-class) subset.
    """
    counts = df['Eat'].value_counts()  # hoisted out of the loop (computed once)
    total = len(df['Eat'])
    entropy = 0.0
    for value in ('Yes', 'No'):
        # A class absent from the subset contributes 0 (lim p->0 of -p*log2 p = 0).
        # The original indexed value_counts()[value] directly, which raised
        # KeyError on pure subsets.
        count = counts.get(value, 0)
        if count > 0:
            fraction = count / total
            entropy += -fraction * np.log2(fraction)
    return entropy
In [3]:
# Sanity check: number of 'Yes' labels in the target column (expected 6 of 10).
df['Eat'].value_counts()['Yes']
Out[3]:
6
In [4]:
# Sanity check: boolean filtering on a feature value (rows where Taste == 'Salty').
df['Taste'][df['Taste'] == 'Salty']
Out[4]:
0    Salty
6    Salty
9    Salty
Name: Taste, dtype: object
In [5]:
# Sanity check: row count for the positive class (should match the count above).
len(df[df['Eat'] == 'Yes'])
Out[5]:
6
In [6]:
# Entropy of the full dataset: 6 Yes / 4 No gives about 0.971 bits.
find_entropy(df)
Out[6]:
0.9709505944546686
In [7]:
def find_entropy_attribute(df, attribute):
    """Weighted average entropy of the 'Eat' target after splitting df on `attribute`.

    This is the second term of the ID3 information gain:
    sum over values v of (|S_v| / |S|) * H(S_v).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `attribute` and the 'Eat' target column.
    attribute : str
        Name of the feature column to split on.

    Returns
    -------
    float
        Weighted entropy, >= 0. Lower means a better split.
    """
    total = len(df)
    weighted_entropy = 0.0
    for variable in df[attribute].unique():
        # Target labels of the rows taking this attribute value. The original
        # applied a full-length boolean mask to an already-filtered Series
        # (chained indexing), which pandas flags for index misalignment.
        subset_target = df['Eat'][df[attribute] == variable]
        den = len(subset_target)  # > 0 by construction (value came from unique())
        entropy = 0.0
        for target_variable in ('Yes', 'No'):
            num = int((subset_target == target_variable).sum())
            # Skip empty classes exactly instead of fudging with eps: the
            # original's `num / (den + eps)` and `log2(fraction + eps)`
            # introduced small systematic errors, and the final
            # negate-then-abs dance only existed to undo a sign mistake.
            if num > 0:
                fraction = num / den
                entropy += -fraction * np.log2(fraction)
        weighted_entropy += (den / total) * entropy
    return weighted_entropy
In [8]:
def build_tree(df):
    """Recursively build an ID3 decision tree from df.

    Assumes the last column of df is the 'Eat' target and all other columns
    are categorical features (see the dataset cell above).

    Returns
    -------
    dict
        Nested dict of the form {attribute: {value: subtree_or_label}}, where
        a leaf is a class label string ('Yes'/'No').
    """
    attributes = df.keys()[:-1]  # every column except the target
    IG = [find_entropy(df) - find_entropy_attribute(df, key) for key in attributes]
    node = attributes[np.argmax(IG)]  # attribute with maximum information gain

    tree = {node: {}}

    for value in np.unique(df[node]):
        sub_table = df[df[node] == value].reset_index(drop=True)
        cls, counts = np.unique(sub_table['Eat'], return_counts=True)

        if len(counts) == 1:  # pure subset -> leaf
            tree[node][value] = cls[0]
        elif len(sub_table) == len(df):
            # The best split did not shrink the subset: the remaining rows are
            # indistinguishable by any attribute (e.g. duplicate rows with
            # conflicting labels). Recursing here would never terminate, so
            # emit the majority label instead.
            tree[node][value] = cls[np.argmax(counts)]
        else:
            tree[node][value] = build_tree(sub_table)  # recurse on the subset

    return tree
In [9]:
# Fit the tree on the full dataset and pretty-print its nested-dict structure.
tree = build_tree(df)

import pprint
pprint.pprint(tree)
{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}},
           'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No',
                                                          'Soft': 'Yes'}},
                                     'Hot': {'Texture': {'Hard': 'Yes',
                                                         'Soft': 'No'}}}},
           'Sweet': 'Yes'}}
In [10]:
def predict(inst, tree):
    """Classify a single instance by walking the decision tree.

    Parameters
    ----------
    inst : mapping (pandas.Series or dict)
        Maps attribute names to the instance's values.
    tree : dict
        Nested tree as produced by build_tree: {attribute: {value: subtree_or_label}}.

    Returns
    -------
    The class label stored at the reached leaf (or 0 if the tree dict is empty).

    Raises
    ------
    KeyError if the instance has an attribute value the tree never saw.
    """
    prediction = 0
    for attribute in tree.keys():
        # Follow the branch matching this instance's value for the attribute.
        branch = tree[attribute][inst[attribute]]
        if type(branch) is dict:
            # Internal node: keep descending.
            prediction = predict(inst, branch)
        else:
            # Leaf: the stored label is the answer.
            prediction = branch
            break
    return prediction
In [11]:
# Take row 6's features (drop the trailing 'Eat' label) as a test instance.
inst = df.iloc[6][:-1]
inst
Out[11]:
Taste          Salty
Temperature     Cold
Texture         Soft
Name: 6, dtype: object
In [12]:
# Ground-truth label for row 6, to compare against the prediction below.
df.iloc[6][-1]
Out[12]:
'No'
In [13]:
# Classify the held-out instance; should match the ground-truth 'No' above.
prediction = predict(inst, tree)
prediction
Out[13]:
'No'