import numpy as np
import pandas as pd
# Load the video-store customer table; the first CSV column becomes the row index.
vstable = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store_2.csv",
    index_col=0,
)
vstable.shape
vstable.head()

# Predictor columns fed to the classifiers.
feature_columns = ['Gender', 'Income', 'Age', 'Rentals', 'Avg Per Visit', 'Genre']
vs_records = vstable[feature_columns]
vs_records.head()

# Class label to be predicted.
vs_target = vstable['Incidentals']
vs_target.head()

# Expand object/categorical predictors into indicator (dummy) columns;
# numeric columns pass through unchanged.
vs_matrix = pd.get_dummies(vs_records)
vs_matrix.head(10)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(vs_matrix, vs_target, test_size=0.2, random_state=33)
vs_train, vs_test, vs_target_train, vs_target_test = split_parts

# Quick look at the held-out portion, then the training portion.
print(vs_test.shape)
vs_test.iloc[:5]
print(vs_train.shape)
vs_train.iloc[:5]
from sklearn import preprocessing
# Learn the per-column min/max from the training rows only, then apply the
# same scaling to both partitions so the test data never influences the fit.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(vs_train)

# transform() returns a bare ndarray; re-wrap it so the column labels and
# row index of the source frames are retained.
vs_train_norm = pd.DataFrame(
    min_max_scaler.transform(vs_train),
    index=vs_train.index,
    columns=vs_train.columns,
)
vs_test_norm = pd.DataFrame(
    min_max_scaler.transform(vs_test),
    index=vs_test.index,
    columns=vs_test.columns,
)

# Optional, for compact array printing:
# np.set_printoptions(precision=2, linewidth=80, suppress=True)
vs_train_norm.head()
vs_test_norm.head()
from sklearn import neighbors, tree, naive_bayes
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Distance-weighted k-nearest-neighbours, trained on the min-max scaled data.
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knnclf.fit(vs_train_norm, vs_target_train)

# Predicted labels for the held-out rows.
knnpreds_test = knnclf.predict(vs_test_norm)
print(knnpreds_test)

# Per-class precision/recall/F1, followed by the confusion matrix.
print(classification_report(vs_target_test, knnpreds_test))
knncm = confusion_matrix(vs_target_test, knnpreds_test)
print(knncm)

# Mean accuracy: test partition first, then training partition.
knn_test_accuracy = knnclf.score(vs_test_norm, vs_target_test)
print(knn_test_accuracy)
knn_train_accuracy = knnclf.score(vs_train_norm, vs_target_train)
print(knn_train_accuracy)
# Entropy-based decision tree, trained on the un-normalized training data.
# A node needs at least 3 samples before a split is considered.
treeclf = tree.DecisionTreeClassifier(min_samples_split=3, criterion='entropy')
treeclf.fit(vs_train, vs_target_train)

# Predicted labels for the held-out rows.
treepreds_test = treeclf.predict(vs_test)
print(treepreds_test)

# Mean accuracy: test partition first, then training partition.
tree_test_accuracy = treeclf.score(vs_test, vs_target_test)
print(tree_test_accuracy)
tree_train_accuracy = treeclf.score(vs_train, vs_target_train)
print(tree_train_accuracy)

# Per-class report and confusion matrix on the test partition.
print(classification_report(vs_target_test, treepreds_test))
treecm = confusion_matrix(vs_target_test, treepreds_test)
print(treecm)
import pylab as plt
%matplotlib inline
plt.matshow(treecm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# Gaussian naive Bayes, trained on the un-normalized training data.
nbclf = naive_bayes.GaussianNB()
nbclf.fit(vs_train, vs_target_train)

# Predicted labels for the held-out rows.
nbpreds_test = nbclf.predict(vs_test)
print(nbpreds_test)

# Mean accuracy: training partition first, then test partition.
nb_train_accuracy = nbclf.score(vs_train, vs_target_train)
print(nb_train_accuracy)
nb_test_accuracy = nbclf.score(vs_test, vs_target_test)
print(nb_test_accuracy)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis, trained on the un-normalized training data.
ldclf = LinearDiscriminantAnalysis()
ldclf.fit(vs_train, vs_target_train)

# Predicted labels for the held-out rows.
ldpreds_test = ldclf.predict(vs_test)
print(ldpreds_test)

# Mean accuracy: training partition first, then test partition.
ld_train_accuracy = ldclf.score(vs_train, vs_target_train)
print(ld_train_accuracy)
ld_test_accuracy = ldclf.score(vs_test, vs_target_test)
print(ld_test_accuracy)
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the decision-tree configuration over the full
# (un-split) feature matrix; one accuracy value is returned per fold.
cv_scores = cross_val_score(treeclf, vs_matrix, vs_target, cv=5)
print(cv_scores)

# Summarize as mean accuracy with a +/- two-standard-deviation band.
cv_mean = cv_scores.mean()
cv_band = cv_scores.std() * 2
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (cv_mean, cv_band))
from sklearn.tree import export_graphviz
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
# Visualize the fitted decision tree in two ways: inline as SVG, and as a PNG
# produced from a DOT file on disk.
#
# The DOT text is kept in `tree_dot` rather than `tree`, so the `sklearn.tree`
# module imported earlier in the file is not shadowed (re-running the
# decision-tree step would otherwise fail on `tree.DecisionTreeClassifier`).
import subprocess

# Plain list of column names; some scikit-learn releases reject a pandas Index here.
tree_feature_names = list(vs_train.columns)
# NOTE(review): class_names must follow the order of treeclf.classes_;
# ['No', 'Yes'] assumes the target labels sort that way - confirm.
tree_class_names = ['No', 'Yes']

# 1) In-memory DOT source, rendered through Graphviz to SVG for inline display.
tree_dot = export_graphviz(treeclf, out_file=None, feature_names=tree_feature_names, class_names=tree_class_names)
graph = Source(tree_dot)
display(SVG(graph.pipe(format='svg')))

# 2) Write the same tree to tree.dot (export_graphviz returns None when out_file is set).
export_graphviz(treeclf, out_file='tree.dot', feature_names=tree_feature_names, class_names=tree_class_names)
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph, format="png")

# Convert the DOT file to PNG with the Graphviz `dot` executable. The argument
# list avoids shell parsing, and check=True raises if the conversion fails
# instead of leaving a missing or stale dtree.png for the next step.
subprocess.run(["dot", "-Tpng", "tree.dot", "-o", "dtree.png"], check=True)
from IPython.display import Image
Image(filename='dtree.png', width=900)