#!/usr/bin/env python
# coding: utf-8

# In[4]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

#from IPython.display import SVG

def show_image(filename, figsize=None, res_dir=True):
    if figsize:
        plt.figure(figsize=figsize)
    if res_dir:
        filename = './res/{}'.format(filename)
    plt.imshow(plt.imread(filename))


# Chapter 7 Regularization for Deep Learning
# ==========================================
#
# The best-fitting model is a large model that has been regularized appropriately.
#
# ### 7.1 Parameter Norm Penalties
#
# \begin{equation}
#     \tilde{J}(\theta; X, y) = J(\theta; X, y) + \alpha \Omega(\theta)
# \end{equation}
#
# where $\Omega(\theta)$ is a parameter norm penalty.
#
# Typically we penalize **only the weights** of the affine transformation at each layer and leave the biases unregularized.
#
#
# #### 7.1.1 $L^2$ Parameter Regularization
#
#
# #### 7.1.2 $L^1$ Regularization
#
# The sparsity property induced by $L^1$ regularization => feature selection
# (toy numpy sketch in the appendix below).

# ### 7.2 Norm Penalties as Constrained Optimization
#
# Constrain $\Omega(\theta)$ to be less than some constant $k$:
#
# \begin{equation}
#     \mathcal{L}(\theta, \alpha; X, y) = J(\theta; X, y) + \alpha (\Omega(\theta) - k)
# \end{equation}
#
# In practice, a column norm limitation is always implemented as an explicit constraint with reprojection
# (reprojection sketch in the appendix below).

# ### 7.3 Regularization and Under-Constrained Problems
#
# The regularized matrix $X^\top X + \alpha I$ is guaranteed to be invertible.
#
#
# ### 7.4 Dataset Augmentation
#
# Create fake data:
#
# + transform
# + inject noise
#
#
# ### 7.5 Noise Robustness
#
# + add noise to the input data
# + add noise to the weights (Bayesian view: the weights are treated as a distribution):
#   equivalent to an additional regularization term.
# + add noise to the output targets: label smoothing (sketch in the appendix below)
#
#
# ### 7.6 Semi-Supervised Learning
#
# Goal: learn a representation so that examples from the same class have similar representations.
#
#
# ### 7.7 Multi-Task Learning
#
# 1. Task-specific parameters
# 2. Generic parameters

# In[5]:

show_image("fig7_2.png")


# ### 7.8 Early Stopping
#
# Run training until the validation set error has not improved for some amount of time,
# then use the parameters with the lowest validation set error seen during the whole run
# (sketch in the appendix below).

# In[6]:

show_image("fig7_3.png", figsize=[10, 8])


# ### 7.9 Parameter Tying and Parameter Sharing
#
# + regularize the parameters of one model (supervised) to be close to those of another model (unsupervised)
# + force sets of parameters to be equal: parameter sharing => convolutional neural networks.

# ### 7.10 Sparse Representations
#
# Place a penalty on the activations of the units in a neural network, encouraging their activations to be sparse.
#
#
# ### 7.11 Bagging and Other Ensemble Methods
#
# ### 7.12 Dropout
#
# Increase the size of the model when using dropout.
#
# With very few training samples, dropout is less effective
# (sketch in the appendix below).
#
#
# ### 7.13 Adversarial Training
# (FGSM sketch in the appendix below)

# In[8]:

show_image("fig7_8.png", figsize=[10, 8])


# ### 7.14 Tangent Distance, Tangent Prop, and Manifold Tangent Classifier

# In[ ]:
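# ### Appendix: toy sketches for selected sections
#
# #### $L^1$/$L^2$ parameter norm penalties (7.1)
#
# A minimal numpy-only sketch, not code from the book: a toy linear-regression loss with an added $\alpha \Omega(w)$ term and its gradient. The function name `penalized_loss_and_grad` and the random data are illustrative assumptions; note that only the weights are penalized (there is no bias term here).

# In[ ]:

import numpy as np

def penalized_loss_and_grad(w, X, y, alpha, norm="l2"):
    """Toy loss J(w) = 0.5 * mean((Xw - y)^2) plus alpha * Omega(w).

    norm="l2": Omega(w) = 0.5 * ||w||_2^2  (weight decay)
    norm="l1": Omega(w) = ||w||_1          (induces sparsity)
    """
    n = X.shape[0]
    err = X @ w - y
    loss = 0.5 * np.mean(err ** 2)
    grad = X.T @ err / n
    if norm == "l2":
        loss += alpha * 0.5 * np.sum(w ** 2)
        grad += alpha * w
    elif norm == "l1":
        loss += alpha * np.sum(np.abs(w))
        grad += alpha * np.sign(w)   # subgradient at w_i = 0
    return loss, grad


# quick check: with the L1 penalty, the small true coefficients are pushed toward 0
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
w_true = np.array([1.0, 0.0, 0.0, -2.0, 0.5])
y = X @ w_true + 0.1 * rng.randn(100)

w = np.zeros(5)
for _ in range(500):                 # plain gradient descent
    _, g = penalized_loss_and_grad(w, X, y, alpha=0.1, norm="l1")
    w -= 0.1 * g
print(w)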
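# #### Norm penalty as an explicit constraint with reprojection (7.2)
#
# A small sketch of the reprojection idea, assuming the constraint is a maximum $L^2$ norm $k$ on each column of a weight matrix; `project_columns_to_norm_ball` is an illustrative helper name, and in practice it would be applied after every gradient step.

# In[ ]:

import numpy as np

def project_columns_to_norm_ball(W, k):
    """Rescale any column of W whose L2 norm exceeds k back onto the norm ball."""
    norms = np.linalg.norm(W, axis=0, keepdims=True)
    scale = np.minimum(1.0, k / np.maximum(norms, 1e-12))
    return W * scale


W = np.random.randn(4, 3) * 3.0
W = project_columns_to_norm_ball(W, k=1.0)
print(np.linalg.norm(W, axis=0))   # every column norm is now <= 1.0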
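# #### Label smoothing (7.5)
#
# A sketch of label smoothing: the hard 0/1 targets of a $k$-class softmax are replaced with $\epsilon / (k-1)$ and $1 - \epsilon$, so the model is never pushed toward infinite logits. The helper name `smooth_labels` is an assumption.

# In[ ]:

import numpy as np

def smooth_labels(y_onehot, eps=0.1):
    """Replace targets 1 with 1 - eps and targets 0 with eps / (k - 1)."""
    k = y_onehot.shape[1]
    return y_onehot * (1.0 - eps) + (1.0 - y_onehot) * eps / (k - 1)


y = np.eye(3)[[0, 2, 1]]           # three one-hot labels over 3 classes
print(smooth_labels(y, eps=0.1))   # rows still sum to 1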
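# #### Early stopping (7.8)
#
# A generic early-stopping loop sketching 7.8: keep the parameters from the step with the lowest validation error and stop once it has not improved for `patience` steps. The callables `step_fn` / `validation_error_fn` and the toy validation curve below are assumptions, not the book's code.

# In[ ]:

import numpy as np

def train_with_early_stopping(step_fn, validation_error_fn, patience=10, max_steps=1000):
    """step_fn() runs one training step and returns the current parameters;
    validation_error_fn(params) returns the validation-set error."""
    best_err, best_params, waited = np.inf, None, 0
    for _ in range(max_steps):
        params = step_fn()
        err = validation_error_fn(params)
        if err < best_err:
            best_err, best_params, waited = err, params.copy(), 0
        else:
            waited += 1
            if waited >= patience:
                break
    return best_params, best_err


# toy usage: the "parameters" keep growing, while the validation error falls for
# 50 steps and then rises, so the returned parameters come from around step 50
# rather than from the last step
errors = np.concatenate([np.linspace(1.0, 0.2, 50), np.linspace(0.21, 0.8, 50)])
state = {"step": 0, "w": np.zeros(1)}

def step_fn():
    state["w"] = state["w"] + 0.1
    return state["w"]

def validation_error_fn(params):
    err = errors[min(state["step"], len(errors) - 1)]
    state["step"] += 1
    return err

print(train_with_early_stopping(step_fn, validation_error_fn, patience=5))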
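# #### Dropout (7.12)
#
# A sketch of inverted dropout on one layer's activations, assuming a keep probability `p_keep`: at training time each unit is zeroed with probability `1 - p_keep` and the survivors are scaled by `1 / p_keep`, so the test-time forward pass needs no rescaling.

# In[ ]:

import numpy as np

def dropout_forward(h, p_keep=0.8, train=True, rng=np.random):
    """Inverted dropout on activations h."""
    if not train:
        return h                                       # identity at test time
    mask = (rng.uniform(size=h.shape) < p_keep) / p_keep
    return h * mask


h = np.ones((2, 5))
print(dropout_forward(h, p_keep=0.8))                  # some units zeroed, rest scaled to 1.25
print(dropout_forward(h, train=False))                 # unchanged at test time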
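# #### Adversarial examples via the fast gradient sign method (7.13)
#
# A sketch of the fast gradient sign method on a toy logistic-regression model: the input is moved a step of size $\epsilon$ along the sign of the gradient of the loss with respect to the input, which lowers the probability assigned to the true class; adversarial training then also trains on such perturbed inputs. The weights and inputs below are arbitrary assumptions.

# In[ ]:

import numpy as np

def fgsm_perturb(x, grad_x, eps=0.25):
    """Fast gradient sign method: x_adv = x + eps * sign(dJ/dx)."""
    return x + eps * np.sign(grad_x)


# toy logistic regression p(y=1|x) = sigmoid(w.x + b) with true label y = 1
w, b = np.array([2.0, -3.0]), 0.5
x, y = np.array([0.5, 0.5]), 1.0

p = 1.0 / (1.0 + np.exp(-(w @ x + b)))   # probability of the true class
grad_x = (p - y) * w                     # gradient of the cross-entropy loss w.r.t. x
x_adv = fgsm_perturb(x, grad_x, eps=0.25)
p_adv = 1.0 / (1.0 + np.exp(-(w @ x_adv + b)))
print(p, p_adv)                          # the probability of the true class drops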