require 'csv'
require 'open-uri'
# Download titanic.csv and parse it into a CSV::Table keyed by the header row.
#
# URI.open is used instead of Kernel#open: since Ruby 3.0, Kernel#open no
# longer handles URLs even when open-uri is loaded. The block form reads the
# whole body and closes the underlying stream before returning.
titanic_csv_url = "https://gist.githubusercontent.com/akhilstanislavose/46023ee10af448b9bb6a9656624cd03c/raw/ce0be9103e55f401f895b153ae6fc293cb91c241/titanic.csv"
csv = URI.open(titanic_csv_url, &:read)
dataset = CSV.parse(csv, headers: true)
# Number of data rows (the header row is not counted).
dataset.count
891
# Shannon entropy (in bits) of a list of probabilities.
#
# Entries that are not strictly positive contribute nothing, which avoids
# evaluating log2(0).
#
# @param probablities [Array<Numeric>] probabilities of each outcome
# @return [Float] -sum(p * log2(p)) over the strictly positive entries
def entropy(probablities)
  total = 0.0
  probablities.each do |prob|
    next unless prob > 0

    total += prob * Math.log2(prob)
  end
  -1 * total
end
# Gini impurity of a list of probabilities.
#
# @param probablities [Array<Numeric>] probabilities of each outcome
# @return [Float] sum(p * (1 - p)) over all entries
def gini(probablities)
  impurity = 0.0
  probablities.each { |prob| impurity += prob * (1 - prob) }
  impurity
end
# Size-weighted average of an impurity measure over the branches of a split.
#
# Each mixture is a list of class counts for one branch. The counts are turned
# into proportions, handed to the block, and the block's result is weighted by
# the branch size. Despite the name, the value is whatever the block measures
# (the callers in this file pass entropy or Gini impurity).
#
# @param mixtures [Array<Array<Numeric>>] class counts, one inner array per branch
# @yieldparam proportions [Array<Float>] class proportions within one branch
# @yieldreturn [Numeric] impurity of that branch
# @return [Float] weighted average impurity; 0.0 when there are no observations
def purity(mixtures, &block)
  total = mixtures.flatten.sum
  # No observations at all: previously this produced NaN (0.0 / 0) or raised a
  # TypeError (0.0 / nil for an empty partition). An empty node has no impurity.
  return 0.0 if total.zero?

  weighted = mixtures.reduce(0.0) do |acc, mixture|
    size = mixture.sum.to_f
    # Empty branches carry no weight and would otherwise divide by zero.
    next acc unless size > 0

    acc + size * yield(mixture.map { |count| count / size })
  end
  weighted / total
end
:purity
# Baseline impurity of the whole dataset, before any split.
died = dataset.count { |row| row['survived'] == '0' }
survived = dataset.count { |row| row['survived'] == '1' }

# The unsplit dataset is a single node holding both outcome counts.
root_node = [[died, survived]]
dataset_entropy = purity(root_node) { |shares| entropy(shares) }
dataset_gini = purity(root_node) { |shares| gini(shares) }

puts "Entropy of dataset = #{dataset_entropy}"
puts "Gini of dataset = #{dataset_gini}"
Entropy of dataset = 0.9607079018756469 Gini of dataset = 0.4730129578614427
# Tally passengers by (survived, sex) in one pass over the dataset.
sex_outcome_tally = Hash.new(0)
dataset.each { |row| sex_outcome_tally[[row['survived'], row['sex']]] += 1 }

survived_female = sex_outcome_tally[['1', 'female']]
survived_male = sex_outcome_tally[['1', 'male']]
died_female = sex_outcome_tally[['0', 'female']]
died_male = sex_outcome_tally[['0', 'male']]

# One [survived, died] pair per branch of the split.
gender_split = [
  [survived_female, died_female],
  [survived_male, died_male]
]
gender_split_entropy = purity(gender_split) { |shares| entropy(shares) }
gender_split_gini = purity(gender_split) { |shares| gini(shares) }

# Gain = impurity before the split minus weighted impurity after it.
puts "Information Gain after gender split using Entropy = #{dataset_entropy - gender_split_entropy}"
puts "Information Gain after gender split using Gini = #{dataset_gini - gender_split_gini}"
Information Gain after gender split using Entropy = 0.2176601066606143 Information Gain after gender split using Gini = 0.13964795747285225
# Tally passengers by (survived, pclass) in one pass over the dataset.
pclass_outcome_tally = Hash.new(0)
dataset.each { |row| pclass_outcome_tally[[row['survived'], row['pclass']]] += 1 }

survived_pclass_1 = pclass_outcome_tally[['1', '1']]
survived_pclass_2 = pclass_outcome_tally[['1', '2']]
survived_pclass_3 = pclass_outcome_tally[['1', '3']]
died_pclass_1 = pclass_outcome_tally[['0', '1']]
died_pclass_2 = pclass_outcome_tally[['0', '2']]
died_pclass_3 = pclass_outcome_tally[['0', '3']]

# One [survived, died] pair per passenger class.
pclass_split = [
  [survived_pclass_1, died_pclass_1],
  [survived_pclass_2, died_pclass_2],
  [survived_pclass_3, died_pclass_3]
]
pclass_split_entropy = purity(pclass_split) { |shares| entropy(shares) }
pclass_split_gini = purity(pclass_split) { |shares| gini(shares) }

# Gain = impurity before the split minus weighted impurity after it.
puts "Information Gain after pclass split using Entropy = #{dataset_entropy - pclass_split_entropy}"
puts "Information Gain after pclass split using Gini = #{dataset_gini - pclass_split_gini}"
Information Gain after pclass split using Entropy = 0.08383104529601149 Information Gain after pclass split using Gini = 0.05462157677138346
# Distinct values of the 'embarked' column, in order of first appearance.
embarked_types = dataset.map { |row| row['embarked'] }.uniq
puts "Embarked types = #{embarked_types}"

# Tally passengers by (survived, embarked) in one pass over the dataset.
embarked_outcome_tally = Hash.new(0)
dataset.each { |row| embarked_outcome_tally[[row['survived'], row['embarked']]] += 1 }

survived_embarked_S = embarked_outcome_tally[['1', 'S']]
survived_embarked_C = embarked_outcome_tally[['1', 'C']]
survived_embarked_Q = embarked_outcome_tally[['1', 'Q']]
survived_embarked_NA = embarked_outcome_tally[['1', 'NA']]
died_embarked_S = embarked_outcome_tally[['0', 'S']]
died_embarked_C = embarked_outcome_tally[['0', 'C']]
died_embarked_Q = embarked_outcome_tally[['0', 'Q']]
died_embarked_NA = embarked_outcome_tally[['0', 'NA']]

# One [survived, died] pair per 'embarked' value.
embarked_split = [
  [survived_embarked_S, died_embarked_S],
  [survived_embarked_C, died_embarked_C],
  [survived_embarked_Q, died_embarked_Q],
  [survived_embarked_NA, died_embarked_NA]
]
embarked_split_entropy = purity(embarked_split) { |shares| entropy(shares) }
embarked_split_gini = purity(embarked_split) { |shares| gini(shares) }

# Gain = impurity before the split minus weighted impurity after it.
puts "Information Gain after embarked split using Entropy = #{dataset_entropy - embarked_split_entropy}"
puts "Information Gain after embarked split using Gini = #{dataset_gini - embarked_split_gini}"
Embarked types = ["S", "C", "Q", "NA"] Information Gain after embarked split using Entropy = 0.024047090707960517 Information Gain after embarked split using Gini = 0.015751498294317823
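IG(gender_split) > IG(pclass_split) > IG(embarked_split), so gender gives the largest information gain and is used as the first split; the remaining attributes are evaluated next within the male and female subsets.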
# Second-level split: within male passengers only, split on 'embarked'.
# Tally (survived, embarked) for male rows in one pass.
male_embarked_tally = Hash.new(0)
dataset.each do |row|
  next unless row['sex'] == 'male'

  male_embarked_tally[[row['survived'], row['embarked']]] += 1
end

survived_male_embarked_S = male_embarked_tally[['1', 'S']]
survived_male_embarked_C = male_embarked_tally[['1', 'C']]
survived_male_embarked_Q = male_embarked_tally[['1', 'Q']]
survived_male_embarked_NA = male_embarked_tally[['1', 'NA']]
died_male_embarked_S = male_embarked_tally[['0', 'S']]
died_male_embarked_C = male_embarked_tally[['0', 'C']]
died_male_embarked_Q = male_embarked_tally[['0', 'Q']]
died_male_embarked_NA = male_embarked_tally[['0', 'NA']]

# One [survived, died] pair per 'embarked' value.
male_embarked_split = [
  [survived_male_embarked_S, died_male_embarked_S],
  [survived_male_embarked_C, died_male_embarked_C],
  [survived_male_embarked_Q, died_male_embarked_Q],
  [survived_male_embarked_NA, died_male_embarked_NA]
]
male_embarked_split_entropy = purity(male_embarked_split) { |ps| entropy(ps) }
male_embarked_split_gini = purity(male_embarked_split) { |ps| gini(ps) }

# Information gain is measured against the node being split, i.e. the male
# passengers alone. The previous baseline (gender_split_entropy / _gini) is the
# weighted impurity of the male AND female nodes together, so the difference
# compared two different populations and could come out negative.
# The parent node is the union of the branches: [total survived, total died].
male_embarked_parent = [male_embarked_split.transpose.map(&:sum)]
male_embarked_parent_entropy = purity(male_embarked_parent) { |ps| entropy(ps) }
male_embarked_parent_gini = purity(male_embarked_parent) { |ps| gini(ps) }

puts "Information Gain after male-embarked split using Entropy = #{male_embarked_parent_entropy - male_embarked_split_entropy}"
puts "Information Gain after male-embarked split using Gini = #{male_embarked_parent_gini - male_embarked_split_gini}"
Information Gain after male-embarked split using Entropy = 0.05942132128706379 Information Gain after male-embarked split using Gini = 0.033595808771653746
# Second-level split: within male passengers only, split on 'pclass'.
# Tally (survived, pclass) for male rows in one pass.
male_pclass_tally = Hash.new(0)
dataset.each do |row|
  next unless row['sex'] == 'male'

  male_pclass_tally[[row['survived'], row['pclass']]] += 1
end

survived_male_pclass_1 = male_pclass_tally[['1', '1']]
survived_male_pclass_2 = male_pclass_tally[['1', '2']]
survived_male_pclass_3 = male_pclass_tally[['1', '3']]
died_male_pclass_1 = male_pclass_tally[['0', '1']]
died_male_pclass_2 = male_pclass_tally[['0', '2']]
died_male_pclass_3 = male_pclass_tally[['0', '3']]

# One [survived, died] pair per passenger class.
male_pclass_split = [
  [survived_male_pclass_1, died_male_pclass_1],
  [survived_male_pclass_2, died_male_pclass_2],
  [survived_male_pclass_3, died_male_pclass_3]
]
male_pclass_split_entropy = purity(male_pclass_split) { |ps| entropy(ps) }
male_pclass_split_gini = purity(male_pclass_split) { |ps| gini(ps) }

# Information gain is measured against the node being split, i.e. the male
# passengers alone. The previous baseline (gender_split_entropy / _gini) is the
# weighted impurity of the male AND female nodes together, so the difference
# compared two different populations.
# The parent node is the union of the branches: [total survived, total died].
male_pclass_parent = [male_pclass_split.transpose.map(&:sum)]
male_pclass_parent_entropy = purity(male_pclass_parent) { |ps| entropy(ps) }
male_pclass_parent_gini = purity(male_pclass_parent) { |ps| gini(ps) }

puts "Information Gain after male-pclass split using Entropy = #{male_pclass_parent_entropy - male_pclass_split_entropy}"
puts "Information Gain after male-pclass split using Gini = #{male_pclass_parent_gini - male_pclass_split_gini}"
Information Gain after male-pclass split using Entropy = 0.08056051348939974 Information Gain after male-pclass split using Gini = 0.04442316613547043
# Second-level split: within female passengers only, split on 'embarked'.
# Tally (survived, embarked) for female rows in one pass.
female_embarked_tally = Hash.new(0)
dataset.each do |row|
  next unless row['sex'] == 'female'

  female_embarked_tally[[row['survived'], row['embarked']]] += 1
end

survived_female_embarked_S = female_embarked_tally[['1', 'S']]
survived_female_embarked_C = female_embarked_tally[['1', 'C']]
survived_female_embarked_Q = female_embarked_tally[['1', 'Q']]
survived_female_embarked_NA = female_embarked_tally[['1', 'NA']]
died_female_embarked_S = female_embarked_tally[['0', 'S']]
died_female_embarked_C = female_embarked_tally[['0', 'C']]
died_female_embarked_Q = female_embarked_tally[['0', 'Q']]
died_female_embarked_NA = female_embarked_tally[['0', 'NA']]

# One [survived, died] pair per 'embarked' value.
female_embarked_split = [
  [survived_female_embarked_S, died_female_embarked_S],
  [survived_female_embarked_C, died_female_embarked_C],
  [survived_female_embarked_Q, died_female_embarked_Q],
  [survived_female_embarked_NA, died_female_embarked_NA]
]
female_embarked_split_entropy = purity(female_embarked_split) { |ps| entropy(ps) }
female_embarked_split_gini = purity(female_embarked_split) { |ps| gini(ps) }

# Information gain is measured against the node being split, i.e. the female
# passengers alone. The previous baseline (gender_split_entropy / _gini) is the
# weighted impurity of the male AND female nodes together; subtracting a
# female-only impurity from it produced a negative "gain", which a correctly
# computed information gain can never be.
# The parent node is the union of the branches: [total survived, total died].
female_embarked_parent = [female_embarked_split.transpose.map(&:sum)]
female_embarked_parent_entropy = purity(female_embarked_parent) { |ps| entropy(ps) }
female_embarked_parent_gini = purity(female_embarked_parent) { |ps| gini(ps) }

puts "Information Gain after female-embarked split using Entropy = #{female_embarked_parent_entropy - female_embarked_split_entropy}"
puts "Information Gain after female-embarked split using Gini = #{female_embarked_parent_gini - female_embarked_split_gini}"
Information Gain after female-embarked split using Entropy = -0.052903841414234765 Information Gain after female-embarked split using Gini = -0.0366266354137858
# Second-level split: within female passengers only, split on 'pclass'.
# Tally (survived, pclass) for female rows in one pass.
female_pclass_tally = Hash.new(0)
dataset.each do |row|
  next unless row['sex'] == 'female'

  female_pclass_tally[[row['survived'], row['pclass']]] += 1
end

survived_female_pclass_1 = female_pclass_tally[['1', '1']]
survived_female_pclass_2 = female_pclass_tally[['1', '2']]
survived_female_pclass_3 = female_pclass_tally[['1', '3']]
died_female_pclass_1 = female_pclass_tally[['0', '1']]
died_female_pclass_2 = female_pclass_tally[['0', '2']]
died_female_pclass_3 = female_pclass_tally[['0', '3']]

# One [survived, died] pair per passenger class.
female_pclass_split = [
  [survived_female_pclass_1, died_female_pclass_1],
  [survived_female_pclass_2, died_female_pclass_2],
  [survived_female_pclass_3, died_female_pclass_3]
]
female_pclass_split_entropy = purity(female_pclass_split) { |ps| entropy(ps) }
female_pclass_split_gini = purity(female_pclass_split) { |ps| gini(ps) }

# Information gain is measured against the node being split, i.e. the female
# passengers alone. The previous baseline (gender_split_entropy / _gini) is the
# weighted impurity of the male AND female nodes together, so the difference
# compared two different populations.
# The parent node is the union of the branches: [total survived, total died].
female_pclass_parent = [female_pclass_split.transpose.map(&:sum)]
female_pclass_parent_entropy = purity(female_pclass_parent) { |ps| entropy(ps) }
female_pclass_parent_gini = purity(female_pclass_parent) { |ps| gini(ps) }

puts "Information Gain after female-pclass split using Entropy = #{female_pclass_parent_entropy - female_pclass_split_entropy}"
puts "Information Gain after female-pclass split using Gini = #{female_pclass_parent_gini - female_pclass_split_gini}"
Information Gain after female-pclass split using Entropy = 0.12696481072875554 Information Gain after female-pclass split using Gini = 0.05036773218080309