#!/usr/bin/env python # coding: utf-8 # # MDI 720 : Statistiques # ## GLM # ### *Joseph Salmon* # # This notebook reproduces the pictures for the course "GLM_fr" # In[ ]: import numpy as np import seaborn as sns import matplotlib.pyplot as plt # for plots from matplotlib import rc from sklearn.linear_model import RidgeCV import seaborn as sns from os import mkdir, path from mpl_toolkits.mplot3d import Axes3D # interaction mode better for 3D from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.preprocessing import OneHotEncoder from sklearn.metrics import accuracy_score from sklearn import datasets from functions_classif import class_round, point_plot,\ frontiere, data_blob_generation, pdf_3D_plot get_ipython().run_line_magic('matplotlib', 'notebook') # In[ ]: dirname = "../prebuiltimages/" if not path.exists(dirname): mkdir(dirname) np.random.seed(seed=44) ############################################################################### # Plot initialization plt.close('all') dirname = "../srcimages/" imageformat = '.pdf' rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']}) params = {'axes.labelsize': 12, 'font.size': 16, 'legend.fontsize': 16, 'text.usetex': True, 'figure.figsize': (8, 6)} plt.rcParams.update(params) sns.set_context("poster") sns.set_palette("colorblind") sns.set_style("white") sns.axes_style() ############################################################################### # display function: saving = False def my_saving_display(fig, dirname, filename, imageformat): """"Saving with personal function.""" filename = filename.replace('.', 'pt') # remove "." to avoid floats issues if saving is True: dirname + filename + imageformat image_name = dirname + filename + imageformat fig.savefig(image_name) # In[ ]: digits = datasets.load_digits() images_and_labels = list(zip(digits.images, digits.target)) fig = plt.figure(figsize=(12, 5)) for index, (image, label) in enumerate(images_and_labels[:10]): plt.subplot(2, 5, index + 1) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Training: %i' % label) plt.show() my_saving_display(fig, dirname, "digits_plot", imageformat) # In[ ]: ############################################################################## # Limits on a simulated example ############################################################################## scenario = 4 n_samples = 100 n_test = 30 if scenario == 3: # Example with 3 classes mean_0 = [0, 0] mean_1 = [1, 0] mean_2 = [0, 1] means = [mean_0, mean_1, mean_2] noise_level = 0.15 elif scenario == 4: # Example with 4 classes mean_0 = [0, 0] mean_1 = [1, 0] mean_2 = [0, 1] mean_3 = [0.5, 0.5] means = [mean_0, mean_1, mean_2, mean_3] noise_level = 0.10 elif scenario == 2: # Example with 4 classes mean_0 = [0, 0] mean_1 = [1, 1] noise_level = 0.10 means = [mean_0, mean_1] n_class = len(means) # get the number of classes considered X, y = data_blob_generation(n_samples, means, noise_level) X_test, y_test = data_blob_generation(n_test, means, noise_level) # In[ ]: ############################################################################## # Naive linear regression on raw observations ############################################################################## resolution_param = 500 # 500 for nice plotting, 50 for fast version regr = LinearRegression() regr.fit(X, y) y_pred_test = class_round(regr.predict(X_test), n_class) # Plotting part fig0 = plt.figure(figsize=(12, 8)) title = "Left out accuracy (regression w./o. dummy variables)" + \ ": {:.2f}".format(accuracy_score(y_test, y_pred_test)) plt.title(title) def f(xx): """Classifier""" return class_round(regr.predict(xx.reshape(1, -1)), n_class) frontiere(f, X, y, step=resolution_param) # reshape needed for sklearn convention on point to predict evaluations display1 = np.asarray([0, 0]).reshape(1, -1) display2 = np.asarray([0.5, 0.5]).reshape(1, -1) display3 = np.asarray([1, 1]).reshape(1, -1) point_plot(fig0, display1, np.zeros(n_class), (90, -60), color_txt='black') point_plot(fig0, display2, np.zeros(n_class), (-200, -15), color_txt='black') point_plot(fig0, display3, np.zeros(n_class), (-100, -60), color_txt='black') plt.show() my_saving_display(fig0, dirname, 'RawRegression' + 'n_class' + str(n_class), imageformat) # In[ ]: ############################################################################## # Naive linear regression on dummy variables ############################################################################## # performing class estimation by performing multi-task learning with dummy # variables enc = OneHotEncoder() enc.fit(y.reshape(-1, 1)) Y = enc.transform(y.reshape(-1, 1)).toarray() regr_multi = LinearRegression() regr_multi.fit(X, Y) proba_vector_test = regr_multi.predict(X_test) y_pred_test = class_round(regr.predict(X_test), n_class) # performance evaluation on new dataset y_pred_test = np.argmax(proba_vector_test, axis=1) title = "Left out accuracy (regression w. dummy variables)" + \ ": {:.2f}".format(accuracy_score(y_test, y_pred_test)) # Plotting part fig1 = plt.figure(figsize=(12, 8)) plt.title(title) def f(xx): """Classifier""" return np.argmax(regr_multi.predict(xx.reshape(1, -1))) frontiere(f, X, y, step=resolution_param) # naive method: estimate probability with linear model over indicators proba_vector1 = regr_multi.predict(display1) label_pred1 = np.argmax(proba_vector1) proba_vector2 = regr_multi.predict(display2) label_pred2 = np.argmax(proba_vector2) proba_vector3 = regr_multi.predict(display3) label_pred3 = np.argmax(proba_vector3) point_plot(fig0, display1, proba_vector1[0], (90, -60), color_txt='black') point_plot(fig0, display2, proba_vector2[0], (-200, -15), color_txt='black') point_plot(fig0, display3, proba_vector3[0], (-100, -60), color_txt='black') plt.show() my_saving_display(fig1, dirname, 'DummyRegression' + 'n_class' + str(n_class), imageformat) # In[ ]: ############################################################################## # Logistic regression ############################################################################## clf = LogisticRegression() clf.fit(X, y) y_logit_test = clf.predict(X_test) title = "Left out accuracy (logistic regression) " + \ ": {:.2f}".format(accuracy_score(y_test, y_logit_test)) fig2 = plt.figure(figsize=(12, 8)) plt.title(title) def f(xx): """Classifier""" return int(clf.predict(xx.reshape(1, -1))) frontiere(f, X, y, step=resolution_param) proba_vector_logit1 = np.exp(clf.predict_log_proba(display1))[0] proba_vector_logit2 = np.exp(clf.predict_log_proba(display2))[0] proba_vector_logit3 = np.exp(clf.predict_log_proba(display3))[0] point_plot(fig0, display1, proba_vector_logit1, (90, -60), color_txt='black') point_plot(fig0, display2, proba_vector_logit2, (-200, -15), color_txt='black') point_plot(fig0, display3, proba_vector_logit3, (-100, -60), color_txt='black') plt.show() my_saving_display(fig2, dirname, 'LogisticRegression' + 'n_class' + str(n_class), imageformat) # In[ ]: ############################################################################## # Plotting the underlying distribution: mixture of isotropic Gaussian ############################################################################## step = 250 fig3 = pdf_3D_plot(means, n_class, noise_level, step) my_saving_display(fig2, dirname, 'mixt_of_iso_gaussian' + 'n_class' + str(n_class), imageformat) # In[ ]: ############################################################################## # Plotting the logistic function ############################################################################## t = np.arange(-10, 10, step=0.01) def logistic(t): """logistic""" return np.exp(t) / (1 + np.exp(t)) fig0 = plt.figure(figsize=(8, 8)) ax1 = plt.subplot(111) ax1.plot(t, logistic(t), label='logistique') ax1.set_ylim(-0.1, 1.1) ax1.set_xlim(-10, 10) plt.legend(loc="upper left", fontsize=14) plt.show() my_saving_display(fig2, dirname, 'logistic', imageformat) # In[ ]: