#!/usr/bin/env python
# coding: utf-8

# # Data Driven Modeling
# ### PhD seminar series at the Chair for Computer Aided Architectural Design (CAAD), ETH Zurich
#
# [Vahid Moosavi](https://vahidmoosavi.com/)
# # Tenth Session
# 06 December 2016
#
# # Deep Networks
#
# ### Topics to be discussed
# * **AutoEncoders**
# * **Deep AutoEncoders**
# * **Representation Learning**
# * **Distributed Representation**
# * **Data Compression**

# In[80]:

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
pd.__version__
import sys
from scipy import stats
import time
from scipy.linalg import norm
import sompylib.sompy as SOM

# we need to install TensorFlow
import tensorflow as tf
import tensorflow.examples.tutorials.mnist.input_data as input_data
get_ipython().run_line_magic('matplotlib', 'inline')

# ## A review of PCA from the point of view of encoding/decoding (reconstruction)
#
# ### 1: Encoding: X_trans = X.dot(PC)
# * **X is an (N x d) matrix and PC is a (d x d) matrix whose columns are the principal components ---> (N x d) dot (d x d) ---> X_trans is (N x d)**
#
# ### 2: Decoding: X_recon = X_trans.dot(PC.T)
# * **X_trans is (N x d) and PC.T is (d x d) ---> X_recon is (N x d)**
#
# ## Encoding/Decoding with no compression
# ![](Images/PCA_Full.png)
#
# ## Encoding/Decoding with compression
# ![](Images/PCA_lower.png)
#
# ### With this compression (a dimensionality-reduced encoding):
# * **We reduce the required memory**
# * **But we lose information**

# ## Some experiments with the MNIST data set and PCA

# In[82]:

"""Test the autoencoder using MNIST."""
# %%
# load MNIST as before
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
mean_img = np.mean(mnist.train.images, axis=0)

# In[63]:

import random
test_xs = mnist.test.images
test_xs_labels = mnist.test.labels
test_xs_norm = np.array([img - mean_img for img in test_xs])
# a fixed random sample of 500 test digits, reused in all plots below
ind_row_test = random.sample(range(test_xs.shape[0]), 500)

# In[31]:

from sklearn.decomposition import PCA
train_xs = mnist.train.images
train_xs_norm = np.array([img - mean_img for img in train_xs])
pca = PCA()
pca.fit(train_xs_norm)

# In[83]:

# sklearn stores the principal components as *rows* of pca.components_;
# transposing makes the columns the principal directions, so the code
# matches the X_trans = X.dot(PC) notation above.
W_pca = pca.components_.T
W_pca.shape

# ## No dimensionality reduction
# ### No reconstruction error

# In[84]:

W_pca = pca.components_.T
sel_comp = W_pca.shape[1]  # keep all 784 components
lowdim_PCA = test_xs_norm.dot(W_pca[:, :sel_comp])
lowdim_PCA.shape
recon_PCA = lowdim_PCA.dot(W_pca[:, :sel_comp].T) + mean_img
n_examples = 10
fig, axs = plt.subplots(2, n_examples, figsize=(10, 2))
for example_i, ind in enumerate(ind_row_test[:n_examples]):
    axs[0][example_i].imshow(np.reshape(test_xs[ind, :], (28, 28)))
    axs[0][example_i].set_axis_off()
    axs[1][example_i].imshow(np.reshape(recon_PCA[ind, :], (28, 28)))
    axs[1][example_i].set_axis_off()
fig.show()
plt.tight_layout()

# ## With dimensionality reduction
# ### Reconstruction error

# In[44]:

W_pca = pca.components_.T
sel_comp = 400
lowdim_PCA = test_xs_norm.dot(W_pca[:, :sel_comp])
lowdim_PCA.shape
recon_PCA = lowdim_PCA.dot(W_pca[:, :sel_comp].T) + mean_img
n_examples = 10
fig, axs = plt.subplots(2, n_examples, figsize=(10, 2))
for example_i, ind in enumerate(ind_row_test[:n_examples]):
    axs[0][example_i].imshow(np.reshape(test_xs[ind, :], (28, 28)))
    axs[0][example_i].set_axis_off()
    axs[1][example_i].imshow(np.reshape(recon_PCA[ind, :], (28, 28)))
    axs[1][example_i].set_axis_off()
fig.show()
plt.tight_layout()

# ## With more dimensionality reduction

# In[46]:

W_pca = pca.components_.T
sel_comp = 2
lowdim_PCA = test_xs_norm.dot(W_pca[:, :sel_comp])
lowdim_PCA.shape
recon_PCA = lowdim_PCA.dot(W_pca[:, :sel_comp].T) + mean_img
n_examples = 10
fig, axs = plt.subplots(2, n_examples, figsize=(10, 2))
for example_i, ind in enumerate(ind_row_test[:n_examples]):
    axs[0][example_i].imshow(np.reshape(test_xs[ind, :], (28, 28)))
    axs[0][example_i].set_axis_off()
    axs[1][example_i].imshow(np.reshape(recon_PCA[ind, :], (28, 28)))
    axs[1][example_i].set_axis_off()
fig.show()
plt.tight_layout()
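# **The three experiments above compare reconstructions only visually. As a small added sketch (not part of the original session), we can also compute the mean squared reconstruction error on the test set for a few numbers of retained components, which makes the memory/information trade-off concrete. It reuses `W_pca`, `test_xs_norm`, `test_xs`, and `mean_img` from the cells above.**

# In[ ]:

# Added sketch (illustration only): MSE of the PCA encode/decode round trip
# as a function of the number of retained components.
for k in [2, 50, 400, W_pca.shape[1]]:
    code = test_xs_norm.dot(W_pca[:, :k])        # encode: (N, 784) -> (N, k)
    recon = code.dot(W_pca[:, :k].T) + mean_img  # decode: (N, k) -> (N, 784)
    mse = np.mean((recon - test_xs) ** 2)
    print('components: {:4d}  reconstruction MSE: {:.6f}'.format(k, mse))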
# ### Autoencoders: a supervised kind of PCA reconstruction, with the input as its own target
# ![](Images/AutoEncoder.png)
#
# ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/4e3bf605748634864343e8c5ffa335f301cbf955)
#
# ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/7fa0e56273a1cb32709b442e2421e9f947522b84)
#
# **An elementwise activation function keeps the layers differentiable: https://en.wikipedia.org/wiki/Activation_function**
#
# ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/370f7b6aee47860a36b808c19582c0caaeb9c3a4)
#
# **Loss function**
# ![](https://wikimedia.org/api/rest_v1/media/math/render/svg/cafbacd34534f23e90f73c0aa39fda11e29696ab)
#
# **The backpropagation algorithm with stochastic gradient descent updates all the parameters after each training example (or mini-batch).**

# # Deep Auto-Encoders
# #### Hinton 2006
# https://www.cs.toronto.edu/~hinton/science.pdf
#
# **"It has been obvious since the 1980s that backpropagation through deep autoencoders would be very effective for nonlinear dimensionality reduction, provided that computers were fast enough, data sets were big enough, and the initial weights were close enough to a good solution. All three conditions are now satisfied."**
#
# * **The idea: take the output of the first encoding layer and train another encoder on top of it, stacking layer by layer until the innermost code layer**
# ![](Images/DeepAutoEncoder.png)

# # Professional implementations
# ## Python libraries
# * **TensorFlow**
# * **Theano**
# * **Keras, on top of the two previous ones**
# * **Lasagne, on top of Theano**
#
# ## Other languages
# * **Torch in Lua**
# * **Caffe in C++**
# * **...**

# # TensorFlow
# * https://github.com/tensorflow
# ### Introduced by Google in November 2015

# ## Deep Autoencoders in TensorFlow
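# **Before the full TensorFlow version, here is a small added numpy sketch (illustration only, with arbitrary toy weights) of a single tied-weight autoencoder layer: the decoder reuses the transpose of the encoder weight matrix, exactly as in the network built below.**

# In[ ]:

# Added sketch: one tied-weight encode/decode step on fake data.
rng = np.random.RandomState(0)
x = rng.rand(5, 784)                   # 5 fake "images"
W = rng.randn(784, 64) * 0.01          # encoder weights (toy initialization)
b_enc = np.zeros(64)
b_dec = np.zeros(784)

z = np.tanh(x.dot(W) + b_enc)          # encode: 784 -> 64
x_recon = np.tanh(z.dot(W.T) + b_dec)  # decode with tied weights: 64 -> 784
cost = np.sum((x_recon - x) ** 2)      # pixel-wise squared error, as below
print(z.shape, x_recon.shape, cost)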
# In[49]:

"""Tutorial on how to create an autoencoder w/ Tensorflow.
Parag K. Mital, Jan 2016
"""
# %% Imports
import tensorflow as tf
import numpy as np
import math

# %% Autoencoder definition
def autoencoder(dimensions=[784, 512, 256, 64]):
    """Build a deep autoencoder w/ tied weights.

    Parameters
    ----------
    dimensions : list, optional
        The number of neurons for each layer of the autoencoder.

    Returns
    -------
    x : Tensor
        Input placeholder to the network
    z : Tensor
        Inner-most latent representation
    y : Tensor
        Output reconstruction of the input
    cost : Tensor
        Overall cost to use for training
    """
    # %% input to the network
    x = tf.placeholder(tf.float32, [None, dimensions[0]], name='x')
    current_input = x

    # %% Build the encoder
    encoder = []
    for layer_i, n_output in enumerate(dimensions[1:]):
        n_input = int(current_input.get_shape()[1])
        W = tf.Variable(
            tf.random_uniform([n_input, n_output],
                              -1.0 / math.sqrt(n_input),
                              1.0 / math.sqrt(n_input)))
        b = tf.Variable(tf.zeros([n_output]))
        encoder.append(W)
        # output = tf.nn.sigmoid(tf.matmul(current_input, W) + b)
        output = tf.nn.tanh(tf.matmul(current_input, W) + b)
        current_input = output

    # %% latent representation
    z = current_input
    encoder.reverse()

    # %% Build the decoder using the same (transposed) weights
    for layer_i, n_output in enumerate(dimensions[:-1][::-1]):
        W = tf.transpose(encoder[layer_i])
        b = tf.Variable(tf.zeros([n_output]))
        output = tf.nn.tanh(tf.matmul(current_input, W) + b)
        current_input = output

    # %% now have the reconstruction through the network
    y = current_input

    # %% cost function measures pixel-wise difference
    cost = tf.reduce_sum(tf.square(y - x))
    return {'x': x, 'z': z, 'y': y, 'cost': cost}

# In[50]:

ae = autoencoder(dimensions=[784, 256, 64, 2])
# ae = autoencoder(dimensions=[784, 1000, 500, 250, 2])
# %%
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(ae['cost'])

# %%
# We create a session to use the graph
sess = tf.Session()
sess.run(tf.initialize_all_variables())

# %%
# Fit all training data
batch_size = 50
n_epochs = 15
for epoch_i in range(n_epochs):
    for batch_i in range(mnist.train.num_examples // batch_size):
        batch_xs, _ = mnist.train.next_batch(batch_size)
        train = np.array([img - mean_img for img in batch_xs])
        sess.run(optimizer, feed_dict={ae['x']: train})
    print(epoch_i, sess.run(ae['cost'], feed_dict={ae['x']: train}))

# %%
# Number of example digits shown in the reconstruction plots below
n_examples = 15

# In[64]:

recon_AE = sess.run(ae['y'], feed_dict={ae['x']: test_xs_norm}) + mean_img
lowdim_AE = sess.run(ae['z'], feed_dict={ae['x']: test_xs_norm})

# ## Comparing the results with SOM and PCA

# In[52]:

Data_tr = train_xs_norm + 1e-32 * np.random.randn(train_xs_norm.shape[0], train_xs_norm.shape[1])
somMNIST = SOM.SOM('som1', Data_tr, mapsize=[60, 60], norm_method='var', initmethod='pca')
# som1 = SOM.SOM('som1', D, mapsize=[1, 100], norm_method='var', initmethod='pca')
somMNIST.train(n_job=1, shared_memory='no', verbose='final')
codebook_MNIST = somMNIST.codebook[:]

# In[56]:

codebook_MNIST_n = SOM.denormalize_by(somMNIST.data_raw, codebook_MNIST, n_method='var') + mean_img

# In[65]:

bmu_test = somMNIST.project_data(test_xs_norm)

# In[66]:

recon_SOM = codebook_MNIST_n[bmu_test]
lowdim_SOM = somMNIST.ind_to_xy(bmu_test)

# In[67]:

lowdim_SOM.shape
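# **The plots below compare the three models visually. As an added sketch (not in the original session), one can also compare them with a single number, the mean squared reconstruction error on the test images. It assumes `recon_PCA` (the 2-component version from the last PCA cell), `recon_SOM`, and `recon_AE` computed above.**

# In[ ]:

# Added sketch: one-number comparison of the three reconstructions.
for name, recon in [('PCA (2 comps)', recon_PCA),
                    ('SOM (60x60)', recon_SOM),
                    ('Autoencoder', recon_AE)]:
    mse = np.mean((recon - test_xs) ** 2)
    print('{:15s} reconstruction MSE: {:.6f}'.format(name, mse))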
# In[68]:

fig = plt.figure()
K = 9
c = np.argmax(test_xs_labels[ind_row_test], axis=1)
colors = plt.cm.RdYlBu_r(np.asarray(c) / float(K))
plt.subplot(2, 2, 1)  # autoencoder 2-D latent code
plt.scatter(lowdim_AE[ind_row_test, 0], lowdim_AE[ind_row_test, 1], s=50, edgecolor='None', marker='o', alpha=1., c=colors)
plt.subplot(2, 2, 2)  # first two principal components
plt.scatter(lowdim_PCA[ind_row_test, 0], lowdim_PCA[ind_row_test, 1], s=50, edgecolor='None', marker='o', alpha=1., c=colors)
plt.subplot(2, 2, 3)  # SOM grid coordinates of the best-matching units
plt.scatter(lowdim_SOM[ind_row_test, 0], lowdim_SOM[ind_row_test, 1], s=50, edgecolor='None', marker='o', alpha=1., c=colors)
fig.set_size_inches(7, 7);

# In[69]:

fig, axs = plt.subplots(4, n_examples, figsize=(10, 2))
for example_i, ind in enumerate(ind_row_test[:n_examples]):
    axs[0][example_i].imshow(np.reshape(test_xs[ind, :], (28, 28)))    # original
    axs[0][example_i].set_axis_off()
    axs[1][example_i].imshow(np.reshape(recon_PCA[ind, :], (28, 28)))  # PCA reconstruction
    axs[1][example_i].set_axis_off()
    axs[2][example_i].imshow(np.reshape(recon_SOM[ind, :], (28, 28)))  # SOM reconstruction
    axs[2][example_i].set_axis_off()
    axs[3][example_i].imshow(np.reshape(recon_AE[ind, :], (28, 28)))   # autoencoder reconstruction
    axs[3][example_i].set_axis_off()
fig.show()
plt.draw()

# In[70]:

# %% Basic test
"""Test the autoencoder using MNIST."""
import tensorflow as tf
import tensorflow.examples.tutorials.mnist.input_data as input_data
import matplotlib.pyplot as plt

# %%
# load MNIST as before
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
mean_img = np.mean(mnist.train.images, axis=0)

# A deeper encoder with a 64-dimensional innermost code
ae2 = autoencoder(dimensions=[784, 256, 120, 80, 64])
# ae = autoencoder(dimensions=[784, 1000, 500, 250, 2])
# %%
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(ae2['cost'])

# %%
# We create a session to use the graph
sess = tf.Session()
sess.run(tf.initialize_all_variables())

# %%
# Fit all training data
batch_size = 50
n_epochs = 30
for epoch_i in range(n_epochs):
    for batch_i in range(mnist.train.num_examples // batch_size):
        batch_xs, _ = mnist.train.next_batch(batch_size)
        train = np.array([img - mean_img for img in batch_xs])
        sess.run(optimizer, feed_dict={ae2['x']: train})
    print(epoch_i, sess.run(ae2['cost'], feed_dict={ae2['x']: train}))

# In[71]:

lowdim_AE2 = sess.run(ae2['z'], feed_dict={ae2['x']: test_xs_norm})
recon_AE2 = sess.run(ae2['y'], feed_dict={ae2['x']: test_xs_norm}) + mean_img

# In[72]:

# Train a SOM on the 64-dimensional autoencoder codes instead of raw pixels
somMNIST2 = SOM.SOM('som1', lowdim_AE2, mapsize=[60, 60], norm_method='var', initmethod='pca')
somMNIST2.train(n_job=1, shared_memory='no', verbose='final')
codebook_MNIST2 = somMNIST2.codebook[:]

# In[73]:

codebook_MNIST_n2 = SOM.denormalize_by(somMNIST2.data_raw, codebook_MNIST2, n_method='var')

# In[74]:

bmu_test2 = somMNIST2.project_data(lowdim_AE2)

# In[75]:

recon_SOM2 = codebook_MNIST_n2[bmu_test2]
lowdim_SOM2 = somMNIST2.ind_to_xy(bmu_test2)

# In[76]:

fig = plt.figure()
K = 9
c = np.argmax(test_xs_labels[ind_row_test], axis=1)
colors = plt.cm.RdYlBu_r(np.asarray(c) / float(K))
plt.subplot(2, 2, 1)  # autoencoder 2-D latent code
plt.scatter(lowdim_AE[ind_row_test, 0], lowdim_AE[ind_row_test, 1], s=20, edgecolor='None', marker='o', alpha=1., c=colors)
plt.subplot(2, 2, 2)  # first two principal components
plt.scatter(lowdim_PCA[ind_row_test, 0], lowdim_PCA[ind_row_test, 1], s=20, edgecolor='None', marker='o', alpha=1., c=colors)
plt.subplot(2, 2, 3)  # SOM on raw pixels
plt.scatter(lowdim_SOM[ind_row_test, 0], lowdim_SOM[ind_row_test, 1], s=20, edgecolor='None', marker='o', alpha=1., c=colors)
plt.subplot(2, 2, 4)  # SOM on the deep autoencoder codes
plt.scatter(lowdim_SOM2[ind_row_test, 0], lowdim_SOM2[ind_row_test, 1], s=20, edgecolor='None', marker='o', alpha=1., c=colors)
fig.set_size_inches(7, 7);
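# **An added, hedged sketch to complement the scatter plots above: a simple k-nearest-neighbour classifier fit on each 2-D embedding gives a rough numeric measure of how well the digit classes separate in each map. It reuses the embeddings and `c` from the cell above and uses scikit-learn; training accuracy is only a crude proxy for separability.**

# In[ ]:

from sklearn.neighbors import KNeighborsClassifier

# Added sketch: class separability in each 2-D embedding, measured by 5-NN.
for name, emb in [('AE 2-D code', lowdim_AE[ind_row_test, :2]),
                  ('PCA (2 comps)', lowdim_PCA[ind_row_test, :2]),
                  ('SOM on pixels', lowdim_SOM[ind_row_test, :2]),
                  ('SOM on AE codes', lowdim_SOM2[ind_row_test, :2])]:
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(emb, c)
    print('{:16s} 5-NN accuracy: {:.3f}'.format(name, knn.score(emb, c)))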
# In[79]:

fig, axs = plt.subplots(5, n_examples, figsize=(12, 4))
for example_i, ind in enumerate(ind_row_test[:n_examples]):
    axs[0][example_i].imshow(np.reshape(test_xs[ind, :], (28, 28)))     # original
    axs[0][example_i].set_axis_off()
    axs[1][example_i].imshow(np.reshape(recon_PCA[ind, :], (28, 28)))   # PCA
    axs[1][example_i].set_axis_off()
    axs[2][example_i].imshow(np.reshape(recon_SOM[ind, :], (28, 28)))   # SOM
    axs[2][example_i].set_axis_off()
    axs[3][example_i].imshow(np.reshape(recon_AE[ind, :], (28, 28)))    # shallow autoencoder
    axs[3][example_i].set_axis_off()
    axs[4][example_i].imshow(np.reshape(recon_AE2[ind, :], (28, 28)))   # deep autoencoder
    axs[4][example_i].set_axis_off()
fig.show()

# # Distributed representation: local and non-local representation

# # Local representation (usually in manifold learning)
#
# ![](http://www.cis.hut.fi/somtoolbox/theory/img1.gif)
#
# * **Classically, each ball (node) is one complete prototypical result**
# * **It is very fast to learn and it is data efficient**
# * **It works well if we already know the right features of the system**

# # Distributed representation (Deep Learning)
# ![](http://neuralnetworksanddeeplearning.com/images/tikz41.png)
#
# * **Each path of activations along the network is one possible prototypical result**
# * **It is a kind of distributed medium or distributed memory, where the elements contribute partially**
# * **It is slower to train and data hungry**
# * **But it learns a new representation while learning to perform another task (e.g. classification)**
# * **By adding each layer, the representational space expands exponentially (see the small sketch after this list)**
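# **A small added sketch of the "exponential expansion" claim above: with n binary (on/off) units per layer, a single layer can be in up to 2^n activation patterns, and stacking L such layers gives up to (2^n)^L distinct activation paths. These numbers are upper bounds on distinct patterns, not what a trained network necessarily uses.**

# In[ ]:

# Added sketch: counting possible activation paths in a layered network.
n_units = 10  # units per layer (hypothetical small network)
for n_layers in [1, 2, 3, 4]:
    n_paths = (2 ** n_units) ** n_layers
    print('{} layer(s) of {} binary units: up to {:.3e} activation paths'.format(
        n_layers, n_units, float(n_paths)))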
# # With a similar idea, one can play with the architecture of the network
#
# ### MLP for prediction
# ![](Images/MLP1.png)

# In[48]:

'''
A Multilayer Perceptron implementation example using the TensorFlow library.
This example is using the MNIST database of handwritten digits
(http://yann.lecun.com/exdb/mnist/)
Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
'''

# Parameters
learning_rate = 0.001
training_epochs = 20
batch_size = 100
display_step = 1

# Network Parameters
# 784-500-500-2000-10
n_hidden_1 = 500   # 1st layer number of features
n_hidden_2 = 500   # 2nd layer number of features
n_hidden_3 = 2000  # 3rd layer number of features
n_input = 784      # MNIST data input (img shape: 28*28)
n_classes = 10     # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Hidden layer with RELU activation
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    layer_3 = tf.nn.relu(layer_3)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_3, weights['out']) + biases['out']
    return out_layer

# In[28]:

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
# (the TF version used here takes (logits, labels) positionally)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# In[29]:

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost],
                            feed_dict={x: batch_x, y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print('Epoch: {} cost: {}'.format(epoch + 1, avg_cost))
    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))

# ## Nevertheless, this is not the benchmark result.
# There are some other tricks which improve the results to above 99%, for example the sketch after this list:
# * **Dropout**
# * **Convolutional Deep Networks**
# * **...**
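# **A hedged sketch of the first trick above (added here, not from the original session): dropout randomly zeroes hidden activations during training, which acts as a regularizer. With the TF API used in this notebook, `tf.nn.dropout` takes a keep probability; one would feed `keep_prob` as roughly 0.5 at training time and 1.0 at evaluation time. The function below is a sketch and is not wired into the training loop above.**

# In[ ]:

keep_prob = tf.placeholder("float")  # ~0.5 at training time, 1.0 at test time

def multilayer_perceptron_dropout(x, weights, biases, keep_prob):
    # Same MLP as above, with dropout applied after each hidden layer
    layer_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    layer_1 = tf.nn.dropout(layer_1, keep_prob)
    layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, weights['h2']), biases['b2']))
    layer_2 = tf.nn.dropout(layer_2, keep_prob)
    layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, weights['h3']), biases['b3']))
    layer_3 = tf.nn.dropout(layer_3, keep_prob)
    return tf.matmul(layer_3, weights['out']) + biases['out']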
# # Further examples
# * https://github.com/aymericdamien/TensorFlow-Examples

# # TensorFlow playground
# * http://playground.tensorflow.org

# # Architectural diversity of deep networks
#
# # Plug and play machine learning
# http://www.asimovinstitute.org/neural-network-zoo/

# ## Siamese network
# [Chopra, Hadsell, LeCun](https://www.cs.nyu.edu/~sumit/research/assets/cvpr05.pdf)
# ![](https://www.cs.nyu.edu/~sumit/research/files/page1_7.png)
#
# # An interesting new application:
# * Conceptually similar to Word2vec and learning contextual similarity
# * https://www.cs.cornell.edu/~sbell/pdf/siggraph2015-bell-bala.pdf
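# **To make the Siamese idea concrete, here is a small added numpy sketch of a contrastive loss in the spirit of the Chopra/Hadsell/LeCun line of work: similar pairs are pulled together and dissimilar pairs are pushed apart up to a margin. The paper's exact loss differs in its details; this is the commonly used simplified margin form.**

# In[ ]:

def contrastive_loss(e1, e2, same, margin=1.0):
    """e1, e2: embeddings of a pair; same: 1 if the pair is similar, else 0.
    Simplified margin-based contrastive loss (illustration only)."""
    d = np.linalg.norm(e1 - e2)
    return same * d ** 2 + (1 - same) * max(0.0, margin - d) ** 2

a, b = np.array([0.1, 0.2]), np.array([0.15, 0.25])
print(contrastive_loss(a, b, same=1))  # small: a similar pair that is already close
print(contrastive_loss(a, b, same=0))  # large: a dissimilar pair inside the margin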