# Logistic-regression walkthrough on the animal-shelter training data:
# load the CSV, turn the string columns into categorical vectors, fold rare
# categories together, split into train/test and fit a logistic GLM that
# predicts the 'OutcomeType_Adoption' column.
#
# NOTE(review): the bare expressions below (e.g. `shelter_data.head(3)`) have
# no effect when run as a plain script; presumably this was exported from an
# interactive notebook/REPL session where each value is displayed. They are
# kept as-is so that an interactive run shows the same intermediate results.

require 'daru'

# Folds every category of +vector+ that occurs fewer than +min_count+ times
# into the single category +label+.
#
# The rename is applied to +vector+ itself (the result of
# +rename_categories+ is not reassigned), so the caller must pass the vector
# it wants modified.
#
# @param vector [#categories, #count, #rename_categories] categorical vector
# @param min_count [Integer] categories seen fewer times than this are folded
# @param label [String] name of the catch-all category
# @return [Object] the same +vector+ that was passed in
def lump_rare_categories(vector, min_count: 10, label: 'other')
  rare = vector.categories.select { |category| vector.count(category) < min_count }
  vector.rename_categories(rare.map { |category| [category, label] }.to_h)
  vector
end

# --- Load and inspect the data ----------------------------------------------
shelter_data = Daru::DataFrame.from_csv('data/animal_shelter_train.csv')
p shelter_data.shape
shelter_data.head(3)

# Treat the descriptive columns as categorical data.
shelter_data.to_category(
  'OutcomeType', 'OutcomeSubtype', 'AnimalType', 'SexuponOutcome', 'Breed', 'Color'
)
nil

# Response variable: the 'OutcomeType_Adoption' column of the contrast-coded
# 'OutcomeType' vector (presumably a 0/1 adoption indicator -- TODO confirm).
shelter_data['OutcomeType_Adoption'] =
  shelter_data['OutcomeType'].contrast_code['OutcomeType_Adoption']
shelter_data.head(3)

# Work on the first 600 rows only.
small = shelter_data.head(600)
small.head(3)

# --- Reduce the number of categories ----------------------------------------
p small['Breed'].categories.size, small['Color'].categories.size

small['Breed'].frequencies.sort(ascending: false).head(10)
lump_rare_categories(small['Breed'])
small['Breed'].frequencies
small['Breed'].base_category = 'other'

p small['Color'].categories.size
small['Color'].frequencies.sort(ascending: false).head(10)
lump_rare_categories(small['Color'])
small['Color'].frequencies
small['Color'].base_category = 'other'

# Give the nil category of 'SexuponOutcome' an explicit name.
small['SexuponOutcome'].frequencies
p small['SexuponOutcome'].categories
small['SexuponOutcome'].rename_categories(nil => 'Unknown')
small['SexuponOutcome'].categories

# --- Train/test split -------------------------------------------------------
# First 500 rows for fitting, last 100 rows for evaluation.
train = small.head(500)
test = small.tail(100)
p train.size, test.size

# Baseline to beat: the larger of m and 1 - m, where m is the mean of the
# response on the test rows (the accuracy of always guessing the majority
# class, assuming the response is 0/1).
m = test['OutcomeType_Adoption'].mean
"Trivial accuracy = #{[m, 1 - m].max}"

# --- Fit and apply the logistic model ---------------------------------------
require 'statsample-glm'

formula = 'OutcomeType_Adoption~AnimalType+Breed+AgeuponOutcome(Weeks)+Color+SexuponOutcome'
glm_adoption = Statsample::GLM::Regression.new(formula, train, :logistic)
glm_adoption.df_for_regression.head(5)
glm_adoption.model.coefficients(:hash)

# Turn each predicted value into a class label, cutting at 0.5.
predict = glm_adoption.predict(test)
predict.map! { |value| value < 0.5 ? 0 : 1 }
predict.head(5)