#!/usr/bin/env python
# coding: utf-8

# The decision tree algorithm is a supervised learning algorithm -- we first construct the tree with historical data, and then use it to predict an outcome. One of the major advantages of decision trees is that they can pick up nonlinear interactions between variables in the data that linear regression can't. In our bear wrestling example, a decision tree can pick up on the fact that you should only wrestle large bears when escape is impossible, whereas a linear regression would have had to weight both factors in the absence of the other.
#
# We can use trees for classification or regression problems. In this lesson, we'll walk through the building blocks of making a decision tree automatically.

# In[17]:


import pandas as pd
import math
import numpy as np
import sys

sys.setrecursionlimit(10000)

# Set index_col to False to avoid pandas thinking that the first column is row indexes (it's age)
income = pd.read_csv('/Users/ememobongekpenyong/Desktop/DQ/income.csv', index_col=False)
income.head()


# ### Categorical Variables
#
# As we can see in our data, we have categorical variables such as workclass that have string values. Multiple individuals can share the same string value. The types of work include State-gov, Self-emp-not-inc, Private, and so on. Each of these strings is a label for a category. Another example of a column of categories is sex, where the options are Male and Female.
#
# Before we get started with decision trees, we need to convert the categorical variables in our data set to numeric variables. This involves assigning a number to each category label, then converting all of the labels in a column to the corresponding numbers.
#
# One strategy is to convert the columns to a categorical type. Under this approach, pandas will display the labels as strings, but internally store them as numbers so we can do computations with them. The numbers aren't always compatible with other libraries like Scikit-learn, though, so it's easier to just do the conversion to numeric upfront.
#
# We can use the pandas.Categorical() class from pandas to perform the conversion to numbers.

# In[2]:


cols = ["workclass", "education", "marital_status", "occupation",
        "relationship", "race", "sex", "native_country", "high_income"]

for name in cols:
    col = pd.Categorical(income[name])
    income[name] = col.codes

income.head()


# In[3]:


# Split income into two parts based on the value of the workclass column.
private_incomes = income[income['workclass'] == 4]
public_incomes = income[income['workclass'] != 4]

print('private income:', len(private_incomes), 'public income:', len(public_incomes))


# When we performed the split, 9865 rows went to the left, where workclass does not equal 4, and 22696 rows went to the right, where workclass equals 4.

# ### Splitting data to make predictions
#
# The nodes at the bottom of the tree, where we decide to stop splitting, are called terminal nodes, or leaves. When we do our splits, we aren't doing them randomly; we have an objective. Our goal is to ensure that we can make a prediction on future data. In order to do this, all rows in each leaf must have only one value for our target column.
#
# We're trying to predict the high_income column.
# If high_income is 1, it means that the person has an income higher than 50k a year.
# If high_income is 0, it means that they have an income less than or equal to 50k a year.
#
# After constructing a tree using the income data, we'll want to make predictions.
# In order to do this, we'll take a new row and feed it through our decision tree.
#
# First, we check whether the person works in the private sector. If they do, we'll check to see whether they're native to the US, and so on.
#
# Eventually, we'll reach a leaf. The leaf will tell us what value we should predict for high_income.
#
# In order to be able to make this prediction, all of the rows in a leaf should only have a single value for high_income. This means that leaves can't have both 0 and 1 values in the high_income column. Each leaf can only have rows with the same values for our target column. If this isn't the case, we won't be able to make effective predictions.
#
# We'll need to continue splitting nodes until we get to a point where all of the rows in a node have the same value for high_income.

# In[4]:


# Compute the entropy of the high_income column in the income dataframe, and assign the result to income_entropy.
# With two classes, entropy is -(p0 * log2(p0) + p1 * log2(p1)), where p0 and p1 are the
# proportions of rows with high_income equal to 0 and 1.
prob_0 = income[income['high_income'] == 0].shape[0] / income.shape[0]
prob_1 = income[income['high_income'] == 1].shape[0] / income.shape[0]

income_entropy = -((prob_0 * math.log(prob_0, 2)) + (prob_1 * math.log(prob_1, 2)))
income_entropy


# ### Information Gain
#
# We'll need a way to go from computing entropy to figuring out which variable to split on. We can do this using information gain, which tells us which split will reduce entropy the most. For a data set T split on a variable A, information gain is the entropy of T minus the weighted average of the entropies of the subsets the split creates.
#
# Here's an alternate explanation. We're finding the entropy of each set post-split, weighting it by the number of items in each split, then subtracting the result from the current entropy. If the result is positive, we've lowered entropy with our split. The higher the result is, the more we've lowered entropy.
#
# One strategy for constructing trees is to create as many branches at each node as there are unique values for the variable we're splitting on. So if the variable has three or four values, we'd end up with three or four branches. This approach usually involves more complexity than it's worth and doesn't improve prediction accuracy, but it's worth knowing about.
#
# To simplify the calculation of information gain and make splits simpler, we won't do it for each unique value. We'll find the median for the variable we're splitting on instead. Any rows where the value of the variable is at or below the median will go to the left branch, and the rest of the rows will go to the right branch. To compute information gain, we'll only have to compute entropies for two subsets.

# In[5]:


# Compute the information gain for splitting on the age column of income.

def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
""" # Compute the counts of each unique value in the column counts = np.bincount(column) # Divide by the total column length to get a probability probabilities = counts/len(column) # Initialize the entropy to 0 entropy = 0 for prob in probabilities: if prob > 0: entropy += prob * math.log(prob, 2) return -entropy entropy = calc_entropy([1,1,0,0,1]) entropy # In[6]: income_entropy = calc_entropy(income["high_income"]) median_age = income["age"].median() left_split = income[income["age"] <= median_age] right_split = income[income["age"] > median_age] age_information_gain = income_entropy - ((left_split.shape[0] / income.shape[0]) * calc_entropy(left_split["high_income"]) + ((right_split.shape[0] / income.shape[0]) * calc_entropy(right_split["high_income"]))) age_information_gain # ### Finding the best split # # Now that we know how to compute information gain, we can determine the best variable to split a node on. When we start our tree, we want to make an initial split. We'll find the variable to split on by calculating which split would have the highest information gain. # In[7]: def calc_information_gain(data, split_name, target_name): """ Calculate information gain given a data set, column to split on, and target """ # Calculate the original entropy original_entropy = calc_entropy(data[target_name]) # Find the median of the column we're splitting column = data[split_name] median = column.median() # Make two subsets of the data, based on the median left_split = data[column <= median] right_split = data[column > median] # Loop through the splits and calculate the subset entropies to_subtract = 0 for subset in [left_split, right_split]: prob = (subset.shape[0] / data.shape[0]) to_subtract += prob * calc_entropy(subset[target_name]) # Return information gain return original_entropy - to_subtract # Verify that our answer is the same as on the last screen # print(calc_information_gain(income, "age", "high_income") columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"] information_gains = [] # Loop through and compute information gains for col in columns: information_gain = calc_information_gain(income, col, 'high_income') information_gains.append(information_gain) # Find the name of the column with the highest gain highest_gain_index = information_gains.index(max(information_gains)) highest_gain = columns[highest_gain_index] highest_gain # In[8]: information_gains # In[15]: # Write a function named find_best_column() that returns the name of a column to split the data on. We've started to define this function for you. 
def find_best_column(data, target_name, columns):
    information_gains = []
    # Loop through and compute information gains
    for col in columns:
        information_gain = calc_information_gain(data, col, target_name)
        information_gains.append(information_gain)

    # Find the name of the column with the highest gain
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain


income_split = find_best_column(income, "high_income", columns)
income_split


# ### Creating a Simple Recursive Algorithm
#
# **We'll use lists to store our labels for nodes (when we find them).**
#
# **Lists can be accessed inside our recursive function, whereas integers can't.**

# In[21]:


# Create the data set that we used in the example on the last screen
data = pd.DataFrame([
    [0, 20, 0],
    [0, 60, 2],
    [0, 40, 1],
    [1, 25, 1],
    [1, 35, 2],
    [1, 55, 1]
])
# Assign column names to the data
data.columns = ["high_income", "age", "marital_status"]

label_1s = []
label_0s = []

def id3(data, target, columns):
    unique_targets = pd.unique(data[target])

    # If the node is pure, record its label in the appropriate list and stop splitting
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            label_0s.append(0)
        elif 1 in unique_targets:
            label_1s.append(1)
        return

    # Otherwise, split on the best column at its median and recurse on both halves
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    for split in [left_split, right_split]:
        id3(split, target, columns)


# Call the function on our data to set the counters properly
id3(data, "high_income", ["age", "marital_status"])
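
# ### Feeding a row through the tree (an illustrative sketch)
#
# The id3() function above only counts leaf labels. As a rough illustration of the prediction walkthrough from earlier -- feeding a new row down the tree until we reach a leaf -- one possible approach is to store each split in a nested dictionary and then traverse it. The build_tree() and predict_row() helpers below are illustrative names, not the implementation this lesson arrives at, and they assume each split keeps shrinking the data (true for the small data set above).

# In[ ]:


def build_tree(data, target, columns):
    """Recursively build a tree of nested dictionaries using median splits."""
    unique_targets = pd.unique(data[target])

    # A leaf: every row has the same target value, so store that value as the label
    if len(unique_targets) == 1:
        return {"label": unique_targets[0]}

    # Otherwise, split on the column with the highest information gain at its median
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    return {
        "column": best_column,
        "median": column_median,
        "left": build_tree(left_split, target, columns),
        "right": build_tree(right_split, target, columns),
    }


def predict_row(tree, row):
    """Feed a single row through the tree until we reach a leaf, then return its label."""
    if "label" in tree:
        return tree["label"]
    if row[tree["column"]] <= tree["median"]:
        return predict_row(tree["left"], row)
    return predict_row(tree["right"], row)


tree = build_tree(data, "high_income", ["age", "marital_status"])
# Predict high_income for the first row of the small data set (its actual value is 0)
predict_row(tree, data.iloc[0])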