#!/usr/bin/env python
# coding: utf-8

# #### In this example, we continue to drill a bit further into the use of scikit-learn for classification, as well as the use of cross-validation for evaluating model performance.

# In[1]:

import numpy as np
import pandas as pd


# In[2]:

vstable = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store_2.csv", index_col=0)
vstable.shape


# In[3]:

vstable.head()


# #### Let's separate the target attribute from the attributes used for model training.

# In[4]:

vs_records = vstable[['Gender','Income','Age','Rentals','Avg Per Visit','Genre']]
vs_records.head()


# In[5]:

vs_target = vstable.Incidentals
vs_target.head()


# #### As before, we use the Pandas "get_dummies" function to create dummy variables for the categorical attributes.

# In[6]:

vs_matrix = pd.get_dummies(vs_records[['Gender','Income','Age','Rentals','Avg Per Visit','Genre']])
vs_matrix.head(10)


# #### Next, we divide the data into randomized training and test partitions (note that the same split should also be performed on the target attribute). The easiest way to do this is to use the "train_test_split" function from "sklearn.model_selection".

# In[7]:

from sklearn.model_selection import train_test_split

vs_train, vs_test, vs_target_train, vs_target_test = train_test_split(vs_matrix, vs_target, test_size=0.2, random_state=33)
print(vs_test.shape)
vs_test[0:5]


# In[8]:

print(vs_train.shape)
vs_train[0:5]


# #### Let's try the KNN classifier. Note that in this example we did not normalize the data.

# In[9]:

from sklearn import neighbors, tree, naive_bayes


# #### First, we'll use the KNN classifier. You can vary K and monitor the accuracy metrics (see below) to find the best value.

# In[10]:

n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
knnclf.fit(vs_train, vs_target_train)


# #### Next, we call the predict function on the test instances to produce the predicted classes.

# In[11]:

knnpreds_test = knnclf.predict(vs_test)


# In[12]:

print(knnpreds_test)


# In[13]:

from sklearn.metrics import classification_report


# In[14]:

print(classification_report(vs_target_test, knnpreds_test))


# In[15]:

print(knnclf.score(vs_test, vs_target_test))


# In[16]:

print(knnclf.score(vs_train, vs_target_train))


# #### You may notice that accuracy on the test data is much lower than in part 1 of this example (the previous notebook), where the data was normalized and rescaled. This suggests that normalization is very important for KNN, both to improve performance and to avoid overfitting. A minimal rescaling sketch is shown below.
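# #### A minimal sketch of rescaling for KNN (added illustration, not part of the original run): we assume scikit-learn's MinMaxScaler, fit it on the training partition only, apply the same transform to the test partition, and re-evaluate KNN on the rescaled data. The scaler choice and the candidate values of K are illustrative assumptions.

# In[ ]:

# Sketch: min-max scale the features, then refit and score KNN (illustrative).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
vs_train_scaled = scaler.fit_transform(vs_train)   # fit the scaler on training data only
vs_test_scaled = scaler.transform(vs_test)         # apply the same scaling to the test data

knnclf_scaled = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
knnclf_scaled.fit(vs_train_scaled, vs_target_train)
print("KNN (scaled) - Score on Training: ", knnclf_scaled.score(vs_train_scaled, vs_target_train))
print("KNN (scaled) - Score on Test: ", knnclf_scaled.score(vs_test_scaled, vs_target_test))

# Optionally, vary K and monitor test accuracy (a simple sweep over illustrative values).
for k in [1, 3, 5, 7, 9]:
    clf = neighbors.KNeighborsClassifier(k, weights='distance')
    clf.fit(vs_train_scaled, vs_target_train)
    print("K =", k, " Test accuracy:", clf.score(vs_test_scaled, vs_target_test))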
# #### Next, let's use a decision tree classifier:

# In[17]:

treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=3)
treeclf = treeclf.fit(vs_train, vs_target_train)


# In[18]:

print(treeclf.score(vs_test, vs_target_test))


# In[19]:

print(treeclf.score(vs_train, vs_target_train))


# #### Now, let's try the Gaussian and Multinomial Naive Bayes classifiers:

# In[20]:

nbclf = naive_bayes.GaussianNB()
nbclf = nbclf.fit(vs_train, vs_target_train)
print("Score on Training: ", nbclf.score(vs_train, vs_target_train))
print("Score on Test: ", nbclf.score(vs_test, vs_target_test))


# In[21]:

nbmclf = naive_bayes.MultinomialNB()
nbmclf = nbmclf.fit(vs_train, vs_target_train)
print("Score on Training: ", nbmclf.score(vs_train, vs_target_train))
print("Score on Test: ", nbmclf.score(vs_test, vs_target_test))


# #### Finally, let's try linear discriminant analysis:

# In[23]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

ldclf = LinearDiscriminantAnalysis()
ldclf = ldclf.fit(vs_train, vs_target_train)
print("Score on Training: ", ldclf.score(vs_train, vs_target_train))
print("Score on Test: ", ldclf.score(vs_test, vs_target_test))


# #### Let's explore various decision tree parameters, and also the use of cross-validation for evaluation:

# In[24]:

import graphviz


# In[25]:

from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score


# In[26]:

treeclf = tree.DecisionTreeClassifier(criterion='entropy')


# In[27]:

cv_scores = cross_val_score(treeclf, vs_train, vs_target_train, cv=5)
cv_scores


# In[28]:

print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))


# In[29]:

treeclf = treeclf.fit(vs_train, vs_target_train)
print("Accuracy on Training: ", treeclf.score(vs_train, vs_target_train))


# In[30]:

export_graphviz(treeclf, out_file='tree.dot', feature_names=vs_train.columns, class_names=["No", "Yes"])
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)


# #### We can obtain summary results on how informative each of the features in the data is:

# In[31]:

print("Feature Importances:\n{}".format(treeclf.feature_importances_))


# In[32]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

def plot_feature_importances(model, n_features, feature_names):
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)

plot_feature_importances(treeclf, len(vs_matrix.columns), vs_matrix.columns)


# #### The above evaluation results indicate overfitting. Pruning the tree (for example, by requiring a minimum leaf size or limiting its depth) may help reduce overfitting; a cross-validated search over such parameters is sketched below.
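# #### One way to explore the tree parameters systematically is a cross-validated grid search (an added sketch, not part of the original analysis): GridSearchCV from sklearn.model_selection evaluates each parameter combination with 5-fold cross-validation on the training partition. The parameter grid below is an illustrative assumption.

# In[ ]:

# Sketch: cross-validated grid search over simple pruning parameters (illustrative grid).
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [3, 4, 5, None],
    'min_samples_leaf': [1, 3, 5],
}
grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5)
grid.fit(vs_train, vs_target_train)
print("Best parameters: ", grid.best_params_)
print("Best X-Val accuracy: %0.2f" % grid.best_score_)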
# In[33]:

treeclf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
cv_scores = cross_val_score(treeclf, vs_train, vs_target_train, cv=5)
print(cv_scores)
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
treeclf = treeclf.fit(vs_train, vs_target_train)
print("Accuracy on Training: ", treeclf.score(vs_train, vs_target_train))


# In[34]:

export_graphviz(treeclf, out_file='tree.dot', feature_names=vs_train.columns, class_names=["No", "Yes"])
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)


# In[36]:

treeclf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
cv_scores = cross_val_score(treeclf, vs_train, vs_target_train, cv=5)
print(cv_scores)
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
treeclf = treeclf.fit(vs_train, vs_target_train)
print("Accuracy on Training: ", treeclf.score(vs_train, vs_target_train))


# In[39]:

export_graphviz(treeclf, out_file='tree.dot', feature_names=vs_train.columns, class_names=["No", "Yes"])
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)


# In[37]:

treeclf = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=3, max_depth=4)
cv_scores = cross_val_score(treeclf, vs_train, vs_target_train, cv=5)
print(cv_scores)
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
treeclf = treeclf.fit(vs_train, vs_target_train)
print("Accuracy on Training: ", treeclf.score(vs_train, vs_target_train))


# In[38]:

export_graphviz(treeclf, out_file='tree.dot', feature_names=vs_train.columns, class_names=["No", "Yes"], filled=True)
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
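# #### As a final check (added note, not part of the original notebook), the pruned tree can also be scored on the held-out test partition, as was done for the earlier classifiers. The cell below simply reuses the most recently fitted tree and the classification_report imported earlier.

# In[ ]:

# Sketch: evaluate the last fitted (pruned) tree on the held-out test data.
print("Accuracy on Test: ", treeclf.score(vs_test, vs_target_test))
print(classification_report(vs_target_test, treeclf.predict(vs_test)))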