In [ ]:
import quilt
from quilt.data.uciml import iris
In [ ]:
# Bare expression: the notebook renders the repr of the imported `iris` package node.
iris
In [ ]:
# Materialize the Bezdek iris table from the quilt package.
train = iris.tables.bezdek_iris()
# Targets come from the 'class' column; the feature matrix is the first
# four columns of the frame's underlying array.
labels = train['class'].values
trainvecs = train.values[:, :4]
In [ ]:
from sklearn import svm

C = 1.0  # regularization parameter shared by every SVM variant below
# Candidate classifiers: linear-kernel SVC, LinearSVC, RBF-kernel SVC and a
# degree-3 polynomial-kernel SVC.
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C),
          svm.SVC(kernel='poly', degree=3, C=C))
# Fit eagerly and keep the results in a list. A generator expression here
# would defer fitting until the first iteration and be exhausted after a
# single pass, so re-running any cell that loops over `models` would
# silently see no models at all.
models = [clf.fit(trainvecs, labels) for clf in models]
In [ ]:
from sklearn.model_selection import cross_val_score as cvs

# One array of 5-fold cross-validation scores per candidate model, in the
# same order as `models`.
scores = [
    cvs(estimator, trainvecs, labels, cv=5)
    for estimator in models
]
In [ ]:
# Show the per-fold cross-validation scores collected for each model.
scores
In [ ]:
# Collapse each model's per-fold scores into a single mean score.
[fold_scores.mean() for fold_scores in scores]
In [ ]:
# Import the pydata_book quilt package under the short alias `pb`.
from quilt.data.akarve import pydata_book as pb
# Bare expression: the notebook renders the repr of the `titanic` node.
pb.titanic

Feature engineering

In [ ]:
# Load the titanic train and test tables from the quilt package.
train, test = pb.titanic.train(), pb.titanic.test()
In [ ]:
# Count missing values per column in the training table.
train.isnull().sum(axis=0)
In [ ]:
# Count missing values per column in the test table.
test.isnull().sum(axis=0)
In [ ]:
# Derived columns, added in place to both tables:
#   IsMale       -- 1 when Sex == 'male', otherwise 0
#   NumRelatives -- SibSp + Parch
test['IsMale'] = test['Sex'].eq('male').astype(int)
test['NumRelatives'] = test['SibSp'] + test['Parch']

train['IsMale'] = train['Sex'].eq('male').astype(int)
train['NumRelatives'] = train['SibSp'] + train['Parch']

# Initial feature list built on the raw 'Age' column.
# NOTE: `features` is reassigned later in the notebook with the imputed
# age column in place of 'Age'.
features = ['Pclass', 'IsMale', 'Age', 'NumRelatives']
In [ ]:
# Candidate fill-in values for missing ages, computed from the training
# table only. Only the mean is used by the cells visible in this notebook.
age_mean = train['Age'].mean()
age_median = train['Age'].median()
In [ ]:
# Show the mean training age that is used as the imputation value.
age_mean
In [ ]:
# Fill missing ages in both tables with the mean age of the *training*
# table, writing the result to a new 'AgeImputeMean' column.
test['AgeImputeMean'] = test['Age'].fillna(value=age_mean)
train['AgeImputeMean'] = train['Age'].fillna(value=age_mean)
In [ ]:
import pandas as pd

# The four model inputs, now using the imputed age column.
features = ['Pclass', 'IsMale', 'NumRelatives', 'AgeImputeMean']
# Record the feature names in the package as a one-row DataFrame at
# titanic/features. Only the name list is stored here; the train/test
# frames themselves are not written back by this cell.
pb._set(['titanic', 'features'], pd.DataFrame(data=[features]))

Training

In [ ]:
# Re-read the training table and the stored feature-name row from the package.
# NOTE(review): the engineered columns were only added to the in-memory
# frame, so this presumably relies on the package handing back that same
# object -- confirm against quilt's caching behavior.
train = pb.titanic.train()
features = pb.titanic.features()
# Row 0 of `features` holds the column names; keep just those columns.
trainsub = train[features.values[0]]
In [ ]:
# Targets are the 'Survived' column; inputs are the selected feature
# columns as a plain array.
trainlabels = train['Survived'].values
trainvecs = trainsub.values

train

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score as cvs

# Shallow random forest with a fixed seed so repeated runs give the same scores.
rfc = RandomForestClassifier(max_depth=3, random_state=0)
# 5-fold cross-validation on the titanic features; the cell displays the
# mean of the fold scores.
scores = cvs(estimator=rfc, X=trainvecs, y=trainlabels, cv=5)
scores.mean()