Tatanic Dataset Assignment

In [1]:
require 'csv'
require 'open-uri'

csv = open("https://gist.githubusercontent.com/akhilstanislavose/46023ee10af448b9bb6a9656624cd03c/raw/ce0be9103e55f401f895b153ae6fc293cb91c241/titanic.csv").read
dataset = CSV.parse(csv, :headers => true)

dataset.count
Out[1]:
891
In [2]:
def entropy(probablities)
  -1 * probablities.reduce(0.0) { |sum,p| sum += p > 0 ? p * Math.log2(p) : 0 }
end

def gini(probablities)
  probablities.reduce(0.0) { |sum,p| sum += p * (1 - p) }
end

def purity(mixtures, &block)
  purity = mixtures.reduce(0.0) do |sum,m|
    size = m.reduce(:+).to_f
    sum += size > 0 ? size * yield(m.collect { |n| n/size }) : 0
  end
  purity / mixtures.flatten.reduce(:+)
end
Out[2]:
:purity
In [3]:
died     = dataset.select { |e| e['survived'] == '0' }.count
survived = dataset.select { |e| e['survived'] == '1' }.count

dataset_entropy = purity([[died,survived]]) { |ps| entropy(ps) }
dataset_gini    = purity([[died,survived]]) { |ps| gini(ps) }

puts "Entropy of dataset = #{dataset_entropy}"
puts "Gini of dataset    = #{dataset_gini}"
Entropy of dataset = 0.9607079018756469
Gini of dataset    = 0.4730129578614427
In [4]:
survived_female = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' }.count
survived_male   = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male'   }.count

died_female     = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' }.count
died_male       = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male'   }.count

gender_split = [[survived_female, died_female],[survived_male, died_male]]

gender_split_entropy = purity(gender_split) { |ps| entropy(ps) }
gender_split_gini    = purity(gender_split) { |ps| gini(ps)    }

puts "Information Gain after gender split using Entropy = #{dataset_entropy - gender_split_entropy}"
puts "Information Gain after gender split using Gini    = #{dataset_gini - gender_split_gini}"
Information Gain after gender split using Entropy = 0.2176601066606143
Information Gain after gender split using Gini    = 0.13964795747285225
In [5]:
survived_pclass_1 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '1' }.count
survived_pclass_2 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '2' }.count
survived_pclass_3 = dataset.select { |e| e['survived'] == '1' && e['pclass'] == '3' }.count

died_pclass_1     = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '1' }.count
died_pclass_2     = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '2' }.count
died_pclass_3     = dataset.select { |e| e['survived'] == '0' && e['pclass'] == '3' }.count

pclass_split = [
  [survived_pclass_1,died_pclass_1],
  [survived_pclass_2,died_pclass_2],
  [survived_pclass_3,died_pclass_3]
]

pclass_split_entropy = purity(pclass_split) { |ps| entropy(ps) }
pclass_split_gini    = purity(pclass_split) { |ps| gini(ps)    }

puts "Information Gain after pclass split using Entropy = #{dataset_entropy - pclass_split_entropy}"
puts "Information Gain after pclass split using Gini    = #{dataset_gini - pclass_split_gini}"
Information Gain after pclass split using Entropy = 0.08383104529601149
Information Gain after pclass split using Gini    = 0.05462157677138346
In [6]:
embarked_types = dataset.collect { |e| e['embarked'] }.uniq
puts "Embarked types = #{embarked_types}"

survived_embarked_S  = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'S' }.count
survived_embarked_C  = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'C' }.count
survived_embarked_Q  = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'Q' }.count
survived_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['embarked'] == 'NA' }.count

died_embarked_S      = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'S' }.count
died_embarked_C      = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'C' }.count
died_embarked_Q      = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'Q' }.count
died_embarked_NA     = dataset.select { |e| e['survived'] == '0' && e['embarked'] == 'NA' }.count

embarked_split = [
  [survived_embarked_S, died_embarked_S],
  [survived_embarked_C, died_embarked_C],
  [survived_embarked_Q, died_embarked_Q],
  [survived_embarked_NA, died_embarked_NA]
]

embarked_split_entropy = purity(embarked_split) { |ps| entropy(ps) }
embarked_split_gini    = purity(embarked_split) { |ps| gini(ps)    }

puts "Information Gain after embarked split using Entropy = #{dataset_entropy - embarked_split_entropy}"
puts "Information Gain after embarked split using Gini    = #{dataset_gini - embarked_split_gini}"
Embarked types = ["S", "C", "Q", "NA"]
Information Gain after embarked split using Entropy = 0.024047090707960517
Information Gain after embarked split using Gini    = 0.015751498294317823

IG(gender_split) > IG(pclass_split) > IG(embarked_split)

In [7]:
survived_male_embarked_S  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'S' }.count
survived_male_embarked_C  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'C' }.count
survived_male_embarked_Q  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'Q' }.count
survived_male_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['embarked'] == 'NA' }.count

died_male_embarked_S  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'S' }.count
died_male_embarked_C  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'C' }.count
died_male_embarked_Q  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'Q' }.count
died_male_embarked_NA = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['embarked'] == 'NA' }.count

male_embarked_split = [
  [survived_male_embarked_S, died_male_embarked_S],
  [survived_male_embarked_C,  died_male_embarked_C],
  [survived_male_embarked_Q,  died_male_embarked_Q],
  [survived_male_embarked_NA, died_male_embarked_NA]
]

male_embarked_split_entropy = purity(male_embarked_split) { |ps| entropy(ps) }
male_embarked_split_gini    = purity(male_embarked_split) { |ps| gini(ps)    }

puts "Information Gain after male-embarked split using Entropy = #{gender_split_entropy - male_embarked_split_entropy}"
puts "Information Gain after male-embarked split using Gini    = #{gender_split_gini - male_embarked_split_gini}"
Information Gain after male-embarked split using Entropy = 0.05942132128706379
Information Gain after male-embarked split using Gini    = 0.033595808771653746
In [8]:
survived_male_pclass_1  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '1' }.count
survived_male_pclass_2  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '2' }.count
survived_male_pclass_3  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'male' && e['pclass'] == '3' }.count

died_male_pclass_1  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '1' }.count
died_male_pclass_2  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '2' }.count
died_male_pclass_3  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'male' && e['pclass'] == '3' }.count

male_pclass_split = [
  [survived_male_pclass_1,died_male_pclass_1],
  [survived_male_pclass_2,died_male_pclass_2],
  [survived_male_pclass_3,died_male_pclass_3]
]

male_pclass_split_entropy = purity(male_pclass_split) { |ps| entropy(ps) }
male_pclass_split_gini    = purity(male_pclass_split) { |ps| gini(ps)    }

puts "Information Gain after male-pclass split using Entropy = #{gender_split_entropy - male_pclass_split_entropy}"
puts "Information Gain after male-pclass split using Gini    = #{gender_split_gini - male_pclass_split_gini}"
Information Gain after male-pclass split using Entropy = 0.08056051348939974
Information Gain after male-pclass split using Gini    = 0.04442316613547043
In [9]:
survived_female_embarked_S  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'S' }.count
survived_female_embarked_C  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'C' }.count
survived_female_embarked_Q  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'Q' }.count
survived_female_embarked_NA = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['embarked'] == 'NA' }.count

died_female_embarked_S  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'S' }.count
died_female_embarked_C  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'C' }.count
died_female_embarked_Q  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'Q' }.count
died_female_embarked_NA = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['embarked'] == 'NA' }.count

female_embarked_split = [
  [survived_female_embarked_S, died_female_embarked_S],
  [survived_female_embarked_C,  died_female_embarked_C],
  [survived_female_embarked_Q,  died_female_embarked_Q],
  [survived_female_embarked_NA, died_female_embarked_NA]
]

female_embarked_split_entropy = purity(female_embarked_split) { |ps| entropy(ps) }
female_embarked_split_gini    = purity(female_embarked_split) { |ps| gini(ps)    }

puts "Information Gain after female-embarked split using Entropy = #{gender_split_entropy - female_embarked_split_entropy}"
puts "Information Gain after female-embarked split using Gini    = #{gender_split_gini - female_embarked_split_gini}"
Information Gain after female-embarked split using Entropy = -0.052903841414234765
Information Gain after female-embarked split using Gini    = -0.0366266354137858
In [10]:
survived_female_pclass_1  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '1' }.count
survived_female_pclass_2  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '2' }.count
survived_female_pclass_3  = dataset.select { |e| e['survived'] == '1' && e['sex'] == 'female' && e['pclass'] == '3' }.count

died_female_pclass_1  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '1' }.count
died_female_pclass_2  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '2' }.count
died_female_pclass_3  = dataset.select { |e| e['survived'] == '0' && e['sex'] == 'female' && e['pclass'] == '3' }.count

female_pclass_split = [
  [survived_female_pclass_1,died_female_pclass_1],
  [survived_female_pclass_2,died_female_pclass_2],
  [survived_female_pclass_3,died_female_pclass_3]
]

female_pclass_split_entropy = purity(female_pclass_split) { |ps| entropy(ps) }
female_pclass_split_gini    = purity(female_pclass_split) { |ps| gini(ps)    }

puts "Information Gain after female-pclass split using Entropy = #{gender_split_entropy - female_pclass_split_entropy}"
puts "Information Gain after female-pclass split using Gini    = #{gender_split_gini - female_pclass_split_gini}"
Information Gain after female-pclass split using Entropy = 0.12696481072875554
Information Gain after female-pclass split using Gini    = 0.05036773218080309