Support Vector Machines (SVMs) classify datasets by introducing an optimal hyperplane between the multi-dimensional data points. A hyperplane is the generalization of a two-dimensional plane to an arbitrary number of dimensions. If the dataset is two-dimensional, the SVM fits a line that provides the best classification of the data. "Best classification" does not necessarily mean that every point in the training dataset is classified perfectly; rather, the line is chosen to satisfy a margin criterion, lying as far as possible from the nearest points of each class. The figure below shows such a hyperplane classifying a dataset.
<img src="../images/SVM.png" style="width: 700px;">
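As a minimal sketch of this idea (on synthetic make_blobs data that is illustrative only, not part of this exercise), a linear SVM exposes the fitted hyperplane through its coef_ and intercept_ attributes:

# Sketch: fit a linear SVM on 2-D toy data and inspect the hyperplane.
# The make_blobs dataset here is illustrative only, not part of the exercise.
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

X_toy, y_toy = make_blobs(n_samples=100, centers=2, random_state=7)
clf = SVC(kernel='linear')
clf.fit(X_toy, y_toy)

# For 2-D data the hyperplane is the line w0*x0 + w1*x1 + b = 0
print(clf.coef_)       # w: normal vector of the separating line
print(clf.intercept_)  # b: offset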
To study how a model's accuracy changes with the amount of training data, sklearn's learning-curve example (ref: http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html) defines a plot_learning_curve helper, and simple toy data for it can be generated with sklearn's make_classification function. The exercise itself, however, uses the Titanic dataset.
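A minimal sketch of that learning-curve workflow, using sklearn's built-in learning_curve helper (which the plot_learning_curve function in the linked example wraps) on make_classification toy data; the names X_toy and y_toy are illustrative, not part of the exercise:

# Sketch: compute a learning curve on toy data (illustrative only)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC

X_toy, y_toy = make_classification(n_samples=300, n_features=5, random_state=7)
train_sizes, train_scores, test_scores = learning_curve(
    SVC(), X_toy, y_toy, cv=5, train_sizes=np.linspace(0.1, 1.0, 5))
print(train_sizes)                   # absolute training-set sizes used
print(test_scores.mean(axis=1))      # mean cross-validated score per size

Turning to the exercise, we first import the required libraries and load the Titanic training and test data.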
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import KFold
train_data = pd.read_csv("https://raw.githubusercontent.com/colaberry/data/master/Titanic/train_data.csv")
test_data = pd.read_csv("https://raw.githubusercontent.com/colaberry/data/master/Titanic/test_data.csv")
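To sanity-check the load, it helps to peek at the first few rows and the overall shape:

# Quick look at the data we just loaded
print(train_data.head())
print(train_data.shape)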
features = ['Pclass', 'Survived','Age_Imputed', 'SibSp', 'Parch', 'Fare', 'C', 'Q', 'female']
#Keeping relevant data for processing
train_data = train_data[features]
#Converting the dataset into an array for cross-validation
array = train_data.values
#Separating the target variable from the independent variables
X = np.delete(array, 1, axis=1)  # drop the Survived column (index 1)
Y = array[:, 1]                  # Survived is the target
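As a side note, the same split can be done directly in pandas, which makes the intent (drop Survived for X, keep it for Y) more explicit; a sketch that produces the same arrays:

# Equivalent split using pandas (sketch)
X = train_data.drop('Survived', axis=1).values
Y = train_data['Survived'].values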
#Setting the test size and train size
test_size = 0.20
seed = 7
# Use the model_selection.train_test_split(...) function to split the data
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
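A quick check that the split produced the expected 80/20 proportions:

# Verify the split sizes
print(X_train.shape, X_test.shape)  # roughly 80% / 20% of the rows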
scoring = 'accuracy'
model = SVC()
# With shuffle=False (the default) KFold is deterministic, so no random_state is needed
kfold = model_selection.KFold(n_splits=10)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
accuracy_train = cv_results.mean()
print(accuracy_train)
0.71514084507
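cross_val_score returns one accuracy per fold, so we can also inspect how stable the estimate is across folds; a brief sketch:

# Per-fold accuracies and their spread
print(cv_results)        # one score per fold
print(cv_results.std())  # variability across folds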
ref_tmp_var = False
import numpy as np

try:
    ref_assert_var = False
    if 0.65 < accuracy_train < 0.75:
        ref_assert_var = True
    else:
        ref_assert_var = False
except Exception:
    print('Please follow the instructions given and use the same variables provided in the instructions.')
else:
    if ref_assert_var:
        ref_tmp_var = True
    else:
        print('Please follow the instructions given and use the same variables provided in the instructions.')

assert ref_tmp_var
We now use the fitted SVM to make predictions on the held-out test data and measure its accuracy.
# Make predictions on test dataset
svm = SVC()
# Use svm.fit(...) to fit the model, then accuracy_score(...) to compute the accuracy
svm.fit(X_train, Y_train)
predictions = svm.predict(X_test)
accuracy_test= accuracy_score(Y_test, predictions)
print(accuracy_test)
0.68156424581
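Since confusion_matrix and classification_report are already imported, a fuller picture of the test-set performance is one short step away:

# Break the accuracy down by class
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))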
ref_tmp_var = False
import numpy as np

try:
    ref_assert_var = False
    if 0.5 < accuracy_test < 0.75:
        ref_assert_var = True
    else:
        ref_assert_var = False
except Exception:
    print('Please follow the instructions given and use the same variables provided in the instructions.')
else:
    if ref_assert_var:
        ref_tmp_var = True
    else:
        print('Please follow the instructions given and use the same variables provided in the instructions.')

assert ref_tmp_var