This code is heavily derived from "Decision Tree from Scratch".

In [1]:
import numpy as np
import pandas as pd

# Machine epsilon: smallest representable float step, used below to guard log2(0).
eps = np.finfo(float).eps
# Toy training set: three categorical features and a binary target 'Eat'.
dataset = {'Taste': ['Salty', 'Spicy', 'Spicy', 'Spicy', 'Spicy', 'Sweet', 'Salty', 'Sweet', 'Spicy', 'Salty'],
           'Temperature': ['Hot', 'Hot', 'Hot', 'Cold', 'Hot', 'Cold', 'Cold', 'Hot', 'Cold', 'Hot'],
           'Texture': ['Soft', 'Soft', 'Hard', 'Hard', 'Hard', 'Soft', 'Soft', 'Soft', 'Soft', 'Hard'],
           'Eat': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes']}

# Column order matters elsewhere: df.keys()[:-1] is taken as the feature list,
# so the target 'Eat' must stay last.
df = pd.DataFrame(dataset, columns=['Taste', 'Temperature', 'Texture', 'Eat'])
In [2]:
def find_entropy(df):
    """Shannon entropy (in bits) of the binary 'Eat' target column of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Eat' column with 'Yes'/'No' labels.

    Returns
    -------
    float
        Entropy in [0, 1]; 0.0 for a pure (single-class) subset.
    """
    counts = df['Eat'].value_counts()  # hoisted out of the loop (computed once)
    total = len(df['Eat'])
    entropy = 0.0
    for value in ('Yes', 'No'):
        # A class absent from the subset contributes 0 (lim p->0 of -p*log2 p = 0).
        # The original indexed value_counts()[value] directly, which raised
        # KeyError on pure subsets.
        count = counts.get(value, 0)
        if count > 0:
            fraction = count / total
            entropy += -fraction * np.log2(fraction)
    return entropy
In [3]:
# Sanity check: number of 'Yes' labels in the target column (expected 6 of 10).
df['Eat'].value_counts()['Yes']
Out[3]:
6
In [4]:
# Sanity check: boolean filtering on a feature value (rows where Taste == 'Salty').
df['Taste'][df['Taste'] == 'Salty']
Out[4]:
0    Salty
6    Salty
9    Salty
Name: Taste, dtype: object
In [5]:
# Sanity check: row count for the positive class (should match the count above).
len(df[df['Eat'] == 'Yes'])
Out[5]:
6
In [6]:
# Entropy of the full dataset: 6 Yes / 4 No gives about 0.971 bits.
find_entropy(df)
Out[6]:
0.9709505944546686
In [7]:
def find_entropy_attribute(df, attribute):
    """Weighted average entropy of the 'Eat' target after splitting df on `attribute`.

    This is the second term of the ID3 information gain:
    sum over values v of (|S_v| / |S|) * H(S_v).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `attribute` and the 'Eat' target column.
    attribute : str
        Name of the feature column to split on.

    Returns
    -------
    float
        Weighted entropy, >= 0. Lower means a better split.
    """
    total = len(df)
    weighted_entropy = 0.0
    for variable in df[attribute].unique():
        # Target labels of the rows taking this attribute value. The original
        # applied a full-length boolean mask to an already-filtered Series
        # (chained indexing), which pandas flags for index misalignment.
        subset_target = df['Eat'][df[attribute] == variable]
        den = len(subset_target)  # > 0 by construction (value came from unique())
        entropy = 0.0
        for target_variable in ('Yes', 'No'):
            num = int((subset_target == target_variable).sum())
            # Skip empty classes exactly instead of fudging with eps: the
            # original's `num / (den + eps)` and `log2(fraction + eps)`
            # introduced small systematic errors, and the final
            # negate-then-abs dance only existed to undo a sign mistake.
            if num > 0:
                fraction = num / den
                entropy += -fraction * np.log2(fraction)
        weighted_entropy += (den / total) * entropy
    return weighted_entropy
In [8]:
def build_tree(df):
    """Recursively build an ID3 decision tree from df.

    Assumes the last column of df is the 'Eat' target and all other columns
    are categorical features (see the dataset cell above).

    Returns
    -------
    dict
        Nested dict of the form {attribute: {value: subtree_or_label}}, where
        a leaf is a class label string ('Yes'/'No').
    """
    attributes = df.keys()[:-1]  # every column except the target
    IG = [find_entropy(df) - find_entropy_attribute(df, key) for key in attributes]
    node = attributes[np.argmax(IG)]  # attribute with maximum information gain

    tree = {node: {}}

    for value in np.unique(df[node]):
        sub_table = df[df[node] == value].reset_index(drop=True)
        cls, counts = np.unique(sub_table['Eat'], return_counts=True)

        if len(counts) == 1:  # pure subset -> leaf
            tree[node][value] = cls[0]
        elif len(sub_table) == len(df):
            # The best split did not shrink the subset: the remaining rows are
            # indistinguishable by any attribute (e.g. duplicate rows with
            # conflicting labels). Recursing here would never terminate, so
            # emit the majority label instead.
            tree[node][value] = cls[np.argmax(counts)]
        else:
            tree[node][value] = build_tree(sub_table)  # recurse on the subset

    return tree
In [9]:
# Fit the tree on the full dataset and pretty-print its nested-dict structure.
tree = build_tree(df)

import pprint
pprint.pprint(tree)
{'Taste': {'Salty': {'Texture': {'Hard': 'Yes', 'Soft': 'No'}},
           'Spicy': {'Temperature': {'Cold': {'Texture': {'Hard': 'No',
                                                          'Soft': 'Yes'}},
                                     'Hot': {'Texture': {'Hard': 'Yes',
                                                         'Soft': 'No'}}}},
           'Sweet': 'Yes'}}
In [10]:
def predict(inst, tree):
    """Classify a single instance by walking the decision tree.

    Parameters
    ----------
    inst : mapping (pandas.Series or dict)
        Maps attribute names to the instance's values.
    tree : dict
        Nested tree as produced by build_tree: {attribute: {value: subtree_or_label}}.

    Returns
    -------
    The class label stored at the reached leaf (or 0 if the tree dict is empty).

    Raises
    ------
    KeyError if the instance has an attribute value the tree never saw.
    """
    prediction = 0
    for attribute in tree.keys():
        # Follow the branch matching this instance's value for the attribute.
        branch = tree[attribute][inst[attribute]]
        if type(branch) is dict:
            # Internal node: keep descending.
            prediction = predict(inst, branch)
        else:
            # Leaf: the stored label is the answer.
            prediction = branch
            break
    return prediction
In [11]:
# Take row 6's features (drop the trailing 'Eat' label) as a test instance.
inst = df.iloc[6][:-1]
inst
Out[11]:
Taste          Salty
Temperature     Cold
Texture         Soft
Name: 6, dtype: object
In [12]:
# Ground-truth label for row 6, to compare against the prediction below.
df.iloc[6][-1]
Out[12]:
'No'
In [13]:
# Classify the held-out instance; should match the ground-truth 'No' above.
prediction = predict(inst, tree)
prediction
Out[13]:
'No'