import quilt
from quilt.data.uciml import iris
from sklearn import svm
from sklearn.model_selection import cross_val_score as cvs

# Bare expression: shows the package node when run interactively
# (it has no effect when executed as a plain script).
iris

# Load the iris table from the quilt package.
train = iris.tables.bezdek_iris()
# The first four columns form the feature matrix; the 'class' column
# supplies the labels.
trainvecs = train.values[:, :4]
labels = train['class'].values

C = 1.0  # regularization parameter shared by every SVM variant below

candidates = (
    svm.SVC(kernel='linear', C=C),
    svm.LinearSVC(C=C),
    svm.SVC(kernel='rbf', gamma=0.7, C=C),
    svm.SVC(kernel='poly', degree=3, C=C),
)
# Fit every candidate on the full data set.  A list is used instead of a
# generator expression so the models are fitted eagerly and remain
# available afterwards; a generator would be exhausted by the single pass
# of the cross-validation loop below.
models = [clf.fit(trainvecs, labels) for clf in candidates]

# One array of cv=5 cross-validation scores per model.
scores = [cvs(m, trainvecs, labels, cv=5) for m in models]
scores
# Mean cross-validation score per model, in the same order as `models`.
[a.mean() for a in scores]
from quilt.data.akarve import pydata_book as pb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score as cvs

# Bare expression: shows the package node when run interactively.
pb.titanic
train = pb.titanic.train()
test = pb.titanic.test()

# Check for nulls in every column of both frames.
train.isnull().sum()
test.isnull().sum()

# Feature engineering: binary sex indicator and a combined relatives count.
test['IsMale'] = (test['Sex'] == 'male').astype(int)
train['IsMale'] = (train['Sex'] == 'male').astype(int)
test['NumRelatives'] = test['SibSp'] + test['Parch']
train['NumRelatives'] = train['SibSp'] + train['Parch']

# Imputation: fill missing ages with statistics computed on the training
# frame only, and apply the same training mean to the test frame.
age_median = train['Age'].median()
age_mean = train['Age'].mean()
age_mean
train['AgeImputeMean'] = train['Age'].fillna(age_mean)
test['AgeImputeMean'] = test['Age'].fillna(age_mean)

# Select the four features we care about (the imputed age column is used
# in place of the raw 'Age' column).
features = ['Pclass', 'IsMale', 'NumRelatives', 'AgeImputeMean']
# Store the feature list in the package as a one-row DataFrame and read it
# back, so `features` below is the packaged copy.
pb._set(['titanic', 'features'], pd.DataFrame([features]))
features = pb.titanic.features()

# Keep using the in-memory `train` frame: the engineered columns above live
# only on that object.  Re-fetching pb.titanic.train() here would retain
# them only if the package node returns the very same DataFrame object,
# which cannot be confirmed from this file.
trainsub = train[features.values[0]]
trainvecs = trainsub.values
trainlabels = train['Survived'].values

# Random forest evaluated with cv=5 cross-validation; the fixed
# random_state seeds the forest.
rfc = RandomForestClassifier(max_depth=3, random_state=0)
scores = cvs(rfc, trainvecs, trainlabels, cv=5)
scores.mean()