#!/usr/bin/env python
# coding: utf-8

# # Data Analysis for Software Engineers
#
# ## Practical Assignment 3
# ## Linear classification
#
# **General Information**
#
# **Due date:** 11 March 2018, 23:59  
# **Submission link:** [here](https://www.dropbox.com/request/7TLjmvTYqgdn5KYUDCLK)
#
# Add your name to this notebook's title.
#
# Take into account that some tasks may not have a rigorous and comprehensive solution.
# Support your code with comments and illustrations where needed. The more conclusions, derivations and explanations you provide, the better.

# In[ ]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)


# ## (Stochastic) Gradient Descent

# Consider the logistic regression method for binary classification.
#
# In this part you need to implement the following algorithms for log-loss optimization:
# * gradient descent (GD)
# * either stochastic gradient descent (SGD) or mini-batch stochastic gradient descent (mini-batch SGD)
#
# In these methods the gradient of the loss function $L(w) = \frac{1}{N}\sum_i^N l_i(w)$ is obtained differently:
#
# * In GD the whole training dataset is used: $\nabla_w L(w) = \nabla_w \frac{1}{N}\sum_i l_i(w)$
#
# * In SGD only one random training object $i$ is used for the gradient estimate: $\nabla_w L(w) \approx \nabla_w l_i(w)$. So one **epoch** of SGD proceeds as follows:
#     * Shuffle the training dataset
#     * Iterate over the dataset objects one by one and perform a gradient step for each
#
# * In mini-batch SGD the gradient is estimated on a small random subsample (mini-batch) of the training dataset: $\nabla_w L(w) \approx \nabla_w \frac{1}{M}\sum_i^M l_i(w)$. So one **epoch** of mini-batch SGD proceeds as follows (see the sketch after this list):
#     * Shuffle the training dataset
#     * Split the dataset into (almost-)equal mini-batches of small size $M$
#     * Iterate over the batches and perform a gradient step for each
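# The cell below is only a minimal illustrative sketch of one mini-batch SGD epoch, not part of the required
# implementation: `grad_fn`, the batch size `M` and the step size are hypothetical placeholders, and
# `sklearn.utils.shuffle` is used as suggested in the hints further below.

# In[ ]:

from sklearn.utils import shuffle

def minibatch_epoch_sketch(X, y, w, grad_fn, step_size, M=32, random_state=0):
    """One epoch of mini-batch SGD: shuffle, split into batches of size M, take a step on each batch."""
    X_shuffled, y_shuffled = shuffle(X, y, random_state=random_state)
    for start in range(0, X_shuffled.shape[0], M):
        X_batch = X_shuffled[start:start + M]
        y_batch = y_shuffled[start:start + M]
        # grad_fn is assumed to return the gradient of the loss averaged over the given batch
        w = w - step_size * grad_fn(w, X_batch, y_batch)
    return w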
# ## Sorting Hat in action

# Put your last name in Russian in the field below

# In[ ]:

USER_NAME = u"Иванов".lower()
ALGORITHM = ["SGD", "Mini Batch SGD"]

print("Implement GD and {}".format(
    ALGORITHM[hash(USER_NAME[::-1]) % 2]
))


# ## Elastic Net Loss (1 point)

# Consider logistic regression with $L_1$ and $L_2$ regularization - elastic net.
#
# $$
# L(w) = \frac{1}{N} \sum_i^N \ln(1+\exp(-y_i(w^\top x_i+w_0))) + \gamma \|w\|_1 + \beta \|w\|_2
# $$
#
# Find its derivative and the update rules for gradient descent:

# Your answer with LaTeX:

# In[ ]:




# # Algorithm Implementation (3 points)

# Some hints:
# * Small random numbers from $[-\frac{1}{2d}, \frac{1}{2d}]$ should be chosen for weight initialization.
# * An efficient step size for GD is approximately $0.01-1$.
# * The step size should be constant for GD and decreasing for SGD, for example $\alpha/\text{epoch\_number}$, where $\alpha$ is some constant.
# * Use [`sklearn.utils.shuffle`](http://scikit-learn.org/stable/modules/generated/sklearn.utils.shuffle.html) to shuffle `X` and `y` in a consistent way.
# * Stopping criteria: for GD use $|L_{old} - L_{new}| < \text{tol}$, for SGD simply do a fixed number of iterations.
# * For code efficiency use numpy vectors to compute gradients.
#
# Additionally:
# * Implement the momentum update (bonus 1 point)

# In[ ]:

from sklearn.base import BaseEstimator, ClassifierMixin


# In[ ]:

# Class for simple gradient descent
class MyVanillaGD(BaseEstimator, ClassifierMixin):

    def __init__(self, alpha=1.0, beta=1.0, gamma=1.0, tol=0.01, max_epoch=1000, random_state=123):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.tol = tol
        self.max_epoch = max_epoch
        self.random_state = random_state
        # Fill in

    def fit(self, X, y):
        # Fill in
        pass

    def predict(self, X):
        # Fill in
        pass

    def predict_proba(self, X):
        # Fill in
        pass


# Class for SGD or mini-batch SGD
class MySGD(BaseEstimator, ClassifierMixin):

    def __init__(self, alpha=1.0, beta=1.0, gamma=1.0, max_epoch=1000, batch_size=128, random_state=123):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.max_epoch = max_epoch
        self.batch_size = batch_size
        self.random_state = random_state
        # Fill in

    def fit(self, X, y):
        # Fill in
        pass

    def predict(self, X):
        # Fill in
        pass

    def predict_proba(self, X):
        # Fill in
        pass


# # Checking on simple datasets (1 point)

# Let's check your models on a toy dataset. Don't forget to standardize the data and then add a constant feature to it. Use the same random state for GD and SGD.
#
# Here you should demonstrate the following plots:
# * a plot with the data points and the decision boundary for each method,
# * a plot of decreasing $L(w)$ for increasing epoch number (for (mini-batch) SGD).

# In[ ]:

from sklearn.datasets import make_classification

RND_SEED = 123


# In[ ]:

def get_toy_dataset():
    return make_classification(n_samples=1000, n_features=2, class_sep=0.5,
                               n_clusters_per_class=1, n_redundant=0,
                               shift=4, scale=2, random_state=RND_SEED)

def plot_decision_boundary(model, X, y):
    fig = plt.figure()
    X1min, X2min = X.min(axis=0)
    X1max, X2max = X.max(axis=0)
    x1, x2 = np.meshgrid(np.linspace(X1min, X1max, 500),
                         np.linspace(X2min, X2max, 500))
    ypred = model.predict(np.c_[x1.ravel(), x2.ravel()])
    ypred = ypred.reshape(x1.shape)
    plt.contourf(x1, x2, ypred, alpha=.4)
    plt.scatter(X[:, 0], X[:, 1], c=y)


# In[ ]:

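# The cell below is a small illustrative sketch of the "standardize, then add a constant feature" step
# mentioned above; `StandardScaler` is just one possible way to standardize, the variable names are
# arbitrary, and the label remapping assumes the $\{-1, +1\}$ encoding used in the log-loss above.

# In[ ]:

from sklearn.preprocessing import StandardScaler

X_toy, y_toy = get_toy_dataset()
y_toy = 2 * y_toy - 1  # map the {0, 1} labels of make_classification to {-1, +1}

scaler = StandardScaler()
X_toy_std = scaler.fit_transform(X_toy)                               # zero mean, unit variance per feature
X_toy_std = np.hstack([X_toy_std, np.ones((X_toy_std.shape[0], 1))])  # append the constant feature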
# # Regularization (2 points)

# #### Task 1
# * Load the first dataset. Use `pickle.load`. Fit a logistic regression classifier on the training samples. Use GD with different regularizations (none, only L1, only L2, both L1 and L2), using the same random state for all runs. Don't forget to standardize the data and then add a constant feature to it.
# * Compare the results of the methods on the train and test data and explain the difference. Demonstrate the effect of regularization on the coefficients.

# In[ ]:




# #### Task 2
# * Use the resulting weight vector of GD with L1 regularization to determine the two most important features. Fit the logistic classifier only on these two features (+ the constant one) and visualize the decision boundary. Does L1 regularization help you to choose important features?

# In[ ]:




# # Real Dataset (3 points)

# In this part of the task you will work with the problem of diabetes diagnostics. Load the diabetes dataset using `pickle.load`.
# This dataset has the following features:
# 1. Number of pregnancies
# 2. Plasma glucose concentration after 2 hours in an oral glucose tolerance test
# 3. Diastolic blood pressure
# 4. Triceps skin fold thickness
# 5. 2-hour serum insulin
# 6. Body mass index
# 7. Diabetes pedigree function
# 8. Age
#
# The class label is equal to 1 if a person has diabetes and to -1 otherwise.

# #### Task 1
#
# Train the logistic regression classifier on this dataset. Use mini-batch SGD without regularization. Don't forget to standardize the data and then add a constant feature to it.

# In[ ]:




# #### Task 2
# In diagnostic problems false positive and false negative errors actually have different costs.
#
# Let's say, if we make a false negative error (don't detect a condition when it is present), then the patient doesn't get the necessary treatment, and if we make a false positive error (detect a condition when it isn't present), then the patient simply needs to be tested further.
#
# Therefore, the cost of a false negative error is higher and we care much more about this type of error. Compute a confusion matrix for the fitted classifier.
#
# How many errors of each type have you got? Compute the false positive and false negative rates for this classifier. Why are they so different?
#
# Useful functions: `sklearn.metrics.confusion_matrix`.

# In[ ]:




# #### Task 3
# To change the proportion of errors of different types you can change the threshold $a$ in the prediction rule $y = \sigma(w^\top x + w_0) > a$, where $a \in [0, 1]$.
#
# Show the ROC curve of the fitted classifier and the point on it which corresponds to $a = 0.5$ (the one you computed at the previous step).
#
# Using the ROC curve, choose $a$ so that the false negative rate is less than $20\%$ while the false positive rate is still small. What accuracy and false positive rate does the final algorithm have?
#
# Useful functions: `sklearn.metrics.roc_curve`.

# In[ ]:

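# The cell below is a minimal, self-contained sketch of how `sklearn.metrics.roc_curve` can be used to plot
# a ROC curve and mark the point corresponding to $a = 0.5$; the random `labels` and `scores` are purely
# illustrative placeholders for your test labels and the predicted probabilities of your fitted classifier.

# In[ ]:

from sklearn.metrics import roc_curve

# Placeholder data: replace with your true test labels and predicted probabilities sigma(w^T x + w_0)
rng = np.random.RandomState(RND_SEED)
labels = rng.choice([-1, 1], size=200)
scores = np.clip(0.5 + 0.3 * labels + 0.2 * rng.randn(200), 0, 1)

fpr, tpr, thresholds = roc_curve(labels, scores)

plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], linestyle='--', label='random guess')

# mark the point that corresponds to the default threshold a = 0.5
idx = np.argmin(np.abs(thresholds - 0.5))
plt.scatter(fpr[idx], tpr[idx], s=80, zorder=3, label='a = 0.5')

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()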