%load_ext watermark
%watermark -a 'Sebastian Raschka' -v -d -p numpy,scipy,matplotlib,scikit-learn

import csv
import urllib.request

url = 'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv'
csv_cont = urllib.request.urlopen(url)
csv_cont = csv_cont.read()  # .decode('utf-8')

# Optional: saving the data to your local drive
with open('./wine_data.csv', 'wb') as out:
    out.write(csv_cont)

import numpy as np

# reading in all data into a NumPy array
all_data = np.loadtxt(open('./wine_data.csv', 'r'),
                      delimiter=',',
                      skiprows=0,
                      dtype=np.float64)

# load class labels from column 1
y_wine = all_data[:, 0]

# conversion of the class labels to integer-type array
y_wine = y_wine.astype(np.int64, copy=False)

# load the 13 features
X_wine = all_data[:, 1:]

# printing some general information about the data
print('\ntotal number of samples (rows):', X_wine.shape[0])
print('total number of features (columns):', X_wine.shape[1])

# printing the 1st wine sample
float_formatter = lambda x: '{:.2f}'.format(x)
np.set_printoptions(formatter={'float_kind': float_formatter})
print('\n1st sample (i.e., 1st row):\nClass label: {:d}\n{:}\n'
      .format(int(y_wine[0]), X_wine[0]))

# printing the relative frequencies of the class labels
print('Class label frequencies')
print('Class 1 samples: {:.2%}'.format(list(y_wine).count(1)/y_wine.shape[0]))
print('Class 2 samples: {:.2%}'.format(list(y_wine).count(2)/y_wine.shape[0]))
print('Class 3 samples: {:.2%}'.format(list(y_wine).count(3)/y_wine.shape[0]))

%matplotlib inline
from matplotlib import pyplot as plt
from math import floor, ceil  # for rounding up and down

plt.figure(figsize=(10, 8))

# bin width of the histogram in steps of 0.15
bins = np.arange(floor(min(X_wine[:, 0])), ceil(max(X_wine[:, 0])), 0.15)

# get the max count for a particular bin for all classes combined
max_bin = max(np.histogram(X_wine[:, 0], bins=bins)[0])

# the order of the colors for each histogram
colors = ('blue', 'red', 'green')

for label, color in zip(range(1, 4), colors):

    mean = np.mean(X_wine[:, 0][y_wine == label])   # class sample mean
    stdev = np.std(X_wine[:, 0][y_wine == label])   # class standard deviation

    plt.hist(X_wine[:, 0][y_wine == label],
             bins=bins,
             alpha=0.3,  # opacity level
             label=r'class {} ($\mu={:.2f}$, $\sigma={:.2f}$)'.format(label, mean, stdev),
             color=color)

plt.ylim([0, max_bin*1.3])
plt.title('Wine dataset - Distribution of alcohol contents')
plt.xlabel('alcohol by volume', fontsize=14)
plt.ylabel('count', fontsize=14)
plt.legend(loc='upper right')
plt.show()

from scipy.stats import pearsonr

plt.figure(figsize=(10, 8))

for label, marker, color in zip(
        range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

    # calculate the Pearson correlation coefficient
    R = pearsonr(X_wine[:, 0][y_wine == label], X_wine[:, 1][y_wine == label])

    plt.scatter(x=X_wine[:, 0][y_wine == label],  # x-axis: feat. from col. 1
                y=X_wine[:, 1][y_wine == label],  # y-axis: feat. from col. 2
                marker=marker,  # data point symbol for the scatter plot
                color=color,
                alpha=0.7,
                label='class {:}, R={:.2f}'.format(label, R[0])  # label for the legend
                )

plt.title('Wine Dataset')
plt.xlabel('alcohol by volume in percent')
plt.ylabel('malic acid in g/l')
plt.legend(loc='upper right')
plt.show()
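# Optional sketch (not part of the original notebook): instead of inspecting one
# feature pair at a time, we can compute the full matrix of pairwise Pearson
# correlations between the 13 features with np.corrcoef and visualize it.
# Assumes X_wine from above; feature names are replaced by column indices for brevity.
corr_matrix = np.corrcoef(X_wine, rowvar=False)  # 13x13 correlation matrix

plt.figure(figsize=(8, 6))
plt.imshow(corr_matrix, interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Pearson correlation coefficient')
plt.xlabel('feature index')
plt.ylabel('feature index')
plt.title('Pairwise feature correlations (Wine dataset)')
plt.show()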
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

for label, marker, color in zip(
        range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

    ax.scatter(X_wine[:, 0][y_wine == label],
               X_wine[:, 1][y_wine == label],
               X_wine[:, 2][y_wine == label],
               marker=marker,
               color=color,
               s=40,
               alpha=0.7,
               label='class {}'.format(label))

ax.set_xlabel('alcohol by volume in percent')
ax.set_ylabel('malic acid in g/l')
ax.set_zlabel('ash content in g/l')

plt.title('Wine dataset')
plt.show()

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(X_wine, y_wine,
                                                    test_size=0.30, random_state=123)

print('Class label frequencies')

print('\nTraining Dataset:')
for l in range(1, 4):
    print('Class {:} samples: {:.2%}'.format(l, list(y_train).count(l)/y_train.shape[0]))

print('\nTest Dataset:')
for l in range(1, 4):
    print('Class {:} samples: {:.2%}'.format(l, list(y_test).count(l)/y_test.shape[0]))

std_scale = preprocessing.StandardScaler().fit(X_train)
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

f, ax = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10, 5))

for a, x_dat, y_lab in zip(ax, (X_train, X_test), (y_train, y_test)):

    for label, marker, color in zip(
            range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

        a.scatter(x=x_dat[:, 0][y_lab == label],
                  y=x_dat[:, 1][y_lab == label],
                  marker=marker,
                  color=color,
                  alpha=0.7,
                  label='class {}'.format(label)
                  )

    a.legend(loc='upper left')

ax[0].set_title('Training Dataset')
ax[1].set_title('Test Dataset')
f.text(0.5, 0.04, 'alcohol (standardized)', ha='center', va='center')
f.text(0.08, 0.5, 'malic acid (standardized)', ha='center', va='center',
       rotation='vertical')
plt.show()

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train_minmax = minmax_scale.transform(X_train)
X_test_minmax = minmax_scale.transform(X_test)

f, ax = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10, 5))

for a, x_dat, y_lab in zip(ax, (X_train_minmax, X_test_minmax), (y_train, y_test)):

    for label, marker, color in zip(
            range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

        a.scatter(x=x_dat[:, 0][y_lab == label],
                  y=x_dat[:, 1][y_lab == label],
                  marker=marker,
                  color=color,
                  alpha=0.7,
                  label='class {}'.format(label)
                  )

    a.legend(loc='upper left')

ax[0].set_title('Training Dataset')
ax[1].set_title('Test Dataset')
f.text(0.5, 0.04, 'alcohol (normalized)', ha='center', va='center')
f.text(0.08, 0.5, 'malic acid (normalized)', ha='center', va='center',
       rotation='vertical')
plt.show()
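# Optional sanity check (not in the original notebook): after standardization,
# the training features should have mean ~0 and standard deviation ~1, and the
# min-max scaled training features should lie in [0, 1]. Test-set values can
# deviate slightly, since both scalers were fit on the training data only.
print('Standardized training set:   mean = {:.2f}, std = {:.2f}'.format(
      X_train.mean(), X_train.std()))
print('Standardized test set:       mean = {:.2f}, std = {:.2f}'.format(
      X_test.mean(), X_test.std()))
print('Min-max scaled training set: min = {:.2f}, max = {:.2f}'.format(
      X_train_minmax.min(), X_train_minmax.max()))
print('Min-max scaled test set:     min = {:.2f}, max = {:.2f}'.format(
      X_test_minmax.min(), X_test_minmax.max()))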
from sklearn.decomposition import PCA

sklearn_pca = PCA(n_components=2)  # number of components to keep
sklearn_transf = sklearn_pca.fit_transform(X_train)

plt.figure(figsize=(10, 8))

for label, marker, color in zip(
        range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

    plt.scatter(x=sklearn_transf[:, 0][y_train == label],
                y=sklearn_transf[:, 1][y_train == label],
                marker=marker,
                color=color,
                alpha=0.7,
                label='class {}'.format(label)
                )

plt.xlabel('vector 1')
plt.ylabel('vector 2')
plt.legend()
plt.title('Most significant singular vectors after linear transformation via PCA')
plt.show()

sklearn_pca = PCA(n_components=None)
sklearn_transf = sklearn_pca.fit_transform(X_train)
sklearn_pca.explained_variance_ratio_

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

sklearn_lda = LDA(n_components=2)
transf_lda = sklearn_lda.fit_transform(X_train, y_train)

plt.figure(figsize=(10, 8))

for label, marker, color in zip(
        range(1, 4), ('x', 'o', '^'), ('blue', 'red', 'green')):

    plt.scatter(x=transf_lda[:, 0][y_train == label],
                y=transf_lda[:, 1][y_train == label],
                marker=marker,
                color=color,
                alpha=0.7,
                label='class {}'.format(label)
                )

plt.xlabel('vector 1')
plt.ylabel('vector 2')
plt.legend()
plt.title('Most significant singular vectors after linear transformation via LDA')
plt.show()

# fit the model
lda_clf = LDA()
lda_clf.fit(X_train, y_train)

# prediction
print('1st sample from test dataset classified as:',
      lda_clf.predict(X_test[0, :].reshape(1, -1)))
print('actual class label:', y_test[0])

from sklearn import metrics

pred_train_lda = lda_clf.predict(X_train)

print('Prediction accuracy for the training dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_train, pred_train_lda)))

pred_test_lda = lda_clf.predict(X_test)

print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test_lda)))

print('Confusion Matrix of the LDA-classifier')
print(metrics.confusion_matrix(y_test, lda_clf.predict(X_test)))

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train)

pred_train_sgd = sgd_clf.predict(X_train)
pred_test_sgd = sgd_clf.predict(X_test)

print('\nPrediction accuracy for the training dataset')
print('{:.2%}\n'.format(metrics.accuracy_score(y_train, pred_train_sgd)))

print('Prediction accuracy for the test dataset')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_sgd)))

print('Confusion Matrix of the SGD-classifier')
print(metrics.confusion_matrix(y_test, sgd_clf.predict(X_test)))

sgd_clf2 = SGDClassifier()
sgd_clf2.fit(X_train[:, :2], y_train)

x_min = X_test[:, 0].min()
x_max = X_test[:, 0].max()
y_min = X_test[:, 1].min()
y_max = X_test[:, 1].max()

step = 0.01
X, Y = np.meshgrid(np.arange(x_min, x_max, step),
                   np.arange(y_min, y_max, step))

Z = sgd_clf2.predict(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)

# plot the decision regions
plt.contourf(X, Y, Z)

# plot the samples from the training dataset
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)

plt.show()

# export objects via pickle
import pickle

pickle_out = open('standardized_data.pkl', 'wb')
pickle.dump([X_train, X_test, y_train, y_test], pickle_out)
pickle_out.close()

pickle_out = open('classifiers.pkl', 'wb')
pickle.dump([lda_clf, sgd_clf], pickle_out)
pickle_out.close()

# import objects via pickle
my_object_file = open('standardized_data.pkl', 'rb')
X_train, X_test, y_train, y_test = pickle.load(my_object_file)
my_object_file.close()

my_object_file = open('classifiers.pkl', 'rb')
lda_clf, sgd_clf = pickle.load(my_object_file)
my_object_file.close()

print('Confusion Matrix of the SGD-classifier')
print(metrics.confusion_matrix(y_test, sgd_clf.predict(X_test)))

training_data = np.hstack((y_train.reshape(y_train.shape[0], 1), X_train))
test_data = np.hstack((y_test.reshape(y_test.shape[0], 1), X_test))

np.savetxt('./training_set.csv', training_data, delimiter=',')
np.savetxt('./test_set.csv', test_data, delimiter=',')
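# Optional addition (not in the original notebook): besides the confusion
# matrices above, scikit-learn's classification_report summarizes per-class
# precision, recall, and F1-score, which makes it easier to see which class
# each model struggles with. Assumes lda_clf, sgd_clf, X_test, y_test from above.
print('Classification report for the LDA-classifier\n')
print(metrics.classification_report(y_test, lda_clf.predict(X_test)))

print('Classification report for the SGD-classifier\n')
print(metrics.classification_report(y_test, sgd_clf.predict(X_test)))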