#!/usr/bin/env python
# coding: utf-8

# # TensorFlow__MLP_ladder-network_MNIST-labeled100__pollenjp
#
# I recommend viewing this notebook via [nbviewer](http://nbviewer.jupyter.org/).
#
# - Paper
#     - [Semi-Supervised Learning with Ladder Networks - Antti Rasmus, Harri Valpola, Mikko Honkala, Mathias Berglund, Tapani Raiko](https://arxiv.org/abs/1507.02672)
# - Code
#     - [rinuboney/ladder](https://github.com/rinuboney/ladder)
#     - [tarvaina/tensorflow-ladder](https://github.com/tarvaina/tensorflow-ladder)

import os,sys
print(sys.version)
import re
from pathlib import Path
import math

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import sklearn
from sklearn import datasets
import tqdm
import csv
import pandas as pd

# Seed NumPy's global random generator. With `seed = None` no fixed seed is
# supplied, so runs are not reproducible; set an integer here to change that.
seed = None
np.random.seed(seed=seed)

# Print the versions of the main libraries for reproducibility of the run.
print("numpy ver: {}".format(np.__version__))
print("scikit-learn ver: {}".format(sklearn.__version__))
print("pandas ver: {}".format(pd.__version__))

# The import and the print above were fused on one physical line, which is a
# SyntaxError; they are two separate statements.
import tensorflow as tf
print("tensorflow ver: {}".format(tf.__version__))

print("tf.executing_eagerly(): {}".format(tf.executing_eagerly()))

# NOTE(review): the text between the print above and the fragment
# "1  # number of layers" is missing from this file. The definitions of
# `layer_sizes`, `mnist_input_data`, `raw_Path` and `show_computational_graph`
# used below are therefore not visible here. `L` is reconstructed from that
# fragment and from how it indexes `layer_sizes`/`weights` below -- TODO
# confirm against the original notebook.
L = len(layer_sizes) - 1  # number of layers

num_examples = 60000
num_epochs = 150
num_labeled = 100

starter_learning_rate = 0.02

decay_after = 15  # epoch after which to begin learning rate decay

batch_size = 100
num_iter = (num_examples // batch_size) * num_epochs  # number of loop iterations


# In[10]:

with tf.name_scope(name="PLACEHOLDER"):
    inputs = tf.placeholder(tf.float32, shape=(None, layer_sizes[0]))
    outputs = tf.placeholder(tf.float32)


# ### Initialize Layer's and Batch Normalization's Weights

# #### [DEF] b_i, w_i

# In[11]:

def b_i(inits, size, name):
    """Return a 1-D `tf.Variable` of length `size` filled with `inits`."""
    return tf.Variable(inits * tf.ones([size]), name=name)


def w_i(shape, name):
    """Return a random-normal weight of `shape` scaled by 1/sqrt(shape[0]).

    The division is applied to the variable, so the returned object is the
    resulting tensor rather than the `tf.Variable` itself. `name` is given to
    the `tf.random_normal` op.
    """
    return tf.Variable(tf.random_normal(shape, name=name)) / math.sqrt(shape[0])


# In[12]:

# (input size, output size) of every layer transition
shapes = list(zip(list(layer_sizes)[:-1], list(layer_sizes[1:])))

weights = {
    'W': [w_i(s, "W") for s in shapes],        # Encoder weights
    'V': [w_i(s[::-1], "V") for s in shapes],  # Decoder weights
    # batch normalization parameter to shift the normalized value
    'beta': [b_i(0.0, layer_sizes[l+1], "beta") for l in range(L)],
    # batch normalization parameter to scale the normalized value
    # NOTE(review): these variables are created under the name "beta", which
    # looks like a copy-paste slip. It is left unchanged because renaming
    # would change the variable names stored in existing checkpoints.
    'gamma': [b_i(1.0, layer_sizes[l+1], "beta") for l in range(L)],
}


# In[13]:

noise_std = 0.3  # scaling factor for noise used in corrupted encoder

# hyperparameters that denote the importance of each layer
denoising_cost = [1000.0, 10.0, 0.10, 0.10, 0.10, 0.10, 0.10]


# In[14]:

def join(l, u):
    """Concatenate the labeled part `l` and the unlabeled part `u` on axis 0."""
    return tf.concat([l, u], 0)


def labeled(x):
    """Return the first `batch_size` rows of `x` (or `x` itself if it is None)."""
    return tf.slice(x, [0, 0], [batch_size, -1]) if x is not None else x


def unlabeled(x):
    """Return the rows of `x` after the first `batch_size` (or `x` if None)."""
    return tf.slice(x, [batch_size, 0], [-1, -1]) if x is not None else x


def split_lu(x):
    """Return the pair `(labeled(x), unlabeled(x))`."""
    return (labeled(x), unlabeled(x))


# In[15]:

training = tf.placeholder(tf.bool)

ewma = tf.train.ExponentialMovingAverage(decay=0.99)  # to calculate the moving averages of mean and variance
bn_assigns = []  # this list stores the updates to be made to average mean and variance


# ### batch_normalization

# #### [DEF] batch_normalization

# In[16]:

def batch_normalization(batch, mean=None, var=None):
    """Normalize `batch` with the given or freshly computed statistics.

    Parameters
    ----------
    batch :
        Tensor to normalize.
    mean, var :
        Statistics to normalize with. If either one is None, both are
        computed from `batch` along axis 0.

    Returns
    -------
    normalized batch :
        `(batch - mean) / sqrt(var + 1e-10)`
    """
    if mean is None or var is None:
        # mean and variance have not been computed yet
        mean, var = tf.nn.moments(batch, axes=[0])
    return (batch - mean) / tf.sqrt(var + tf.constant(1e-10))


# In[17]:

# average mean and variance of all layers
running_mean = [tf.Variable(tf.constant(0.0, shape=[l]), trainable=False) for l in layer_sizes[1:]]
running_var = [tf.Variable(tf.constant(1.0, shape=[l]), trainable=False) for l in layer_sizes[1:]]


# #### [DEF] update_batch_normalization

# In[18]:

def update_batch_normalization(batch, l):
    """Batch normalize and update the average mean and variance of layer `l`.

    Parameters
    ----------
    batch :
        Tensor to normalize.
    l :
        Layer number (1-based; index `l-1` is used into the running lists).

    Globals
    -------
    running_mean, running_var : list,
        These lists store the average mean and variance of all layers.
    ewma : tf.train.ExponentialMovingAverage,
        Calculates the moving averages of mean and variance.
    bn_assigns : list,
        Stores the updates to be made to the average mean and variance;
        one `ewma.apply` op is appended per call.

    Returns
    -------
    normalized batch
    """
    mean, var = tf.nn.moments(batch, axes=[0])
    assign_mean = running_mean[l-1].assign(mean)  # Update
    assign_var = running_var[l-1].assign(var)     # Update
    bn_assigns.append(ewma.apply([running_mean[l-1], running_var[l-1]]))  # Store moving averages
    with tf.control_dependencies([assign_mean, assign_var]):
        # return after assign
        return (batch - mean) / tf.sqrt(var + 1e-10)


# ### Encoder

# In[19]:

def encoder(inputs, noise_std):
    """Build the encoder path of the ladder network.

    Parameters
    ----------
    inputs :
        Input tensor; the first `batch_size` rows are treated as labeled
        examples and the remaining rows as unlabeled ones (see `split_lu`).
    noise_std : float,
        noise_std != 0.0 --> Corrupted Encoder
        noise_std == 0.0 --> Clean Encoder

    Globals
    -------
    split_lu : func
    layer_sizes : list
    weights : dict
    join : func
    batch_normalization : func
    running_mean, running_var : list,
        These lists store the average mean and variance of all layers.

    Returns
    -------
    h :
        Activation of the last layer (softmax output).
    d : dict
        Per-layer pre-activations 'z', activations 'h', and (for unlabeled
        examples) mean 'm' and variance 'v', keyed by 'labeled'/'unlabeled'.
    """
    h = inputs + tf.random_normal(tf.shape(inputs)) * noise_std  # add noise to input
    d = {}  # to store the pre-activation, activation, mean and variance for each layer
    # The data for labeled and unlabeled examples are stored separately
    d['labeled'] = {'z': {}, 'm': {}, 'v': {}, 'h': {}}    # m=mean, v=variance
    d['unlabeled'] = {'z': {}, 'm': {}, 'v': {}, 'h': {}}  # m=mean, v=variance
    d['labeled']['z'][0], d['unlabeled']['z'][0] = split_lu(h)

    for l in range(1, L+1):
        print( "Layer {:>3}: {:>5} -> {:>5}".format(l,layer_sizes[l-1], layer_sizes[l]) )
        d['labeled']['h'][l-1], d['unlabeled']['h'][l-1] = split_lu(h)
        z_pre = tf.matmul(h, weights['W'][l-1])  # pre-activation
        z_pre_l, z_pre_u = split_lu(z_pre)       # split labeled and unlabeled examples

        m, v = tf.nn.moments(z_pre_u, axes=[0])  # compute mean, variance using twice later (efficiency)

        #----------------------------------------
        # if training:
        def training_batch_norm():
            # Training batch normalization
            # batch normalization for labeled and unlabeled examples is performed separately
            if noise_std > 0:
                # Corrupted encoder
                # batch normalization + noise
                z = join(batch_normalization(z_pre_l), batch_normalization(z_pre_u, m, v))
                z += tf.random_normal(tf.shape(z_pre)) * noise_std
            else:
                # Clean encoder
                # batch normalization + update the average mean and variance
                # using batch mean and variance of labeled examples
                z = join(update_batch_normalization(z_pre_l, l), batch_normalization(z_pre_u, m, v))
            return z

        # else:
        def eval_batch_norm():
            # Evaluation batch normalization
            # obtain average mean and variance and use it to normalize the batch
            mean, var = ewma.average(running_mean[l-1]), ewma.average(running_var[l-1])
            z = batch_normalization(z_pre, mean, var)
            # Instead of the above statement, the use of the following 2 statements containing a typo
            # consistently produces a 0.2% higher accuracy for unclear reasons.
            # m_l, v_l = tf.nn.moments(z_pre_l, axes=[0])
            # z = join(batch_normalization(z_pre_l, m_l, mean, var), batch_normalization(z_pre_u, mean, var))
            return z

        # perform batch normalization according to value of boolean "training" placeholder:
        z = tf.cond(pred=training, true_fn=training_batch_norm, false_fn=eval_batch_norm)
        #----------------------------------------

        if l == L:
            # use softmax activation in output layer
            h = tf.nn.softmax(weights['gamma'][l-1] * (z + weights["beta"][l-1]))
        else:
            # use ReLU activation in hidden layers
            h = tf.nn.relu(z + weights["beta"][l-1])
        d['labeled']['z'][l], d['unlabeled']['z'][l] = split_lu(z)
        d['unlabeled']['m'][l], d['unlabeled']['v'][l] = m, v  # save mean and variance of unlabeled examples for decoding
    d['labeled']['h'][l], d['unlabeled']['h'][l] = split_lu(h)
    return h, d


# #### Encode

# In[20]:

with tf.name_scope(name="Corrupted_Encoder"):
    print( "=== Corrupted Encoder ===")
    y_c, corr = encoder(inputs, noise_std)

with tf.name_scope(name="Clean_Encoder"):
    print( "=== Clean Encoder ===" )
    y, clean = encoder(inputs, 0.0)  # 0.0 -> do not add noise


# ### Decoder

# #### [DEF] g_gauss

# In[21]:

def g_gauss(z_c, u, size):
    """Gaussian denoising function proposed in the original paper.

    Parameters
    ----------
    z_c :
        z in the corrupted layer.
    u :
        Batch normalized h~(l) (l=0,...,L).
    size :
        Length of each of the ten per-unit parameter vectors a1..a10.

    Returns
    -------
    z_est :
        `(z_c - mu) * v + mu`, where `mu` and `v` are computed from `u`.
    """
    def make_param(inits, name):
        # Local helper (kept distinct from the module-level `w_i`).
        return tf.Variable(inits * tf.ones([size]), name=name)

    a1 = make_param(0., 'a1')
    a2 = make_param(1., 'a2')
    a3 = make_param(0., 'a3')
    a4 = make_param(0., 'a4')
    a5 = make_param(0., 'a5')

    a6 = make_param(0., 'a6')
    a7 = make_param(1., 'a7')
    a8 = make_param(0., 'a8')
    a9 = make_param(0., 'a9')
    a10 = make_param(0., 'a10')

    mu = a1 * tf.sigmoid(a2 * u + a3) + a4 * u + a5
    v = a6 * tf.sigmoid(a7 * u + a8) + a9 * u + a10

    z_est = (z_c - mu) * v + mu
    return z_est


# #### Decode

# In[22]:

# Decoder
print( "=== Decoder ===" )
with tf.name_scope(name="Decoder"):
    z_est = {}
    d_cost = []  # to store the denoising cost of all layers
    for l in range(L, -1, -1):
        print( "Layer {:>2}: {:>5} -> {:>5}, denoising cost: {:>7.1f}".format(
            l,
            layer_sizes[l+1] if l+1 < len(layer_sizes) else "None",
            layer_sizes[l],
            denoising_cost[l]))
        z, z_c = clean['unlabeled']['z'][l], corr['unlabeled']['z'][l]
        # layer 0 has no stored statistics, so fall back to the defaults
        m, v = clean['unlabeled']['m'].get(l, 0), clean['unlabeled']['v'].get(l, 1-1e-10)
        if l == L:
            u = unlabeled(y_c)
        else:
            u = tf.matmul(z_est[l+1], weights['V'][l])
        u = batch_normalization(u)
        z_est[l] = g_gauss(z_c, u, layer_sizes[l])
        z_est_bn = (z_est[l] - m) / v
        # append the cost of this layer to d_cost
        d_cost.append((tf.reduce_mean(tf.reduce_sum(tf.square(z_est_bn - z), 1)) / layer_sizes[l]) * denoising_cost[l])


# ### parameter

# #### Cost, Loss

# In[23]:

# calculate total unsupervised cost by adding the denoising cost of all layers
with tf.name_scope(name="Cost"):
    u_cost = tf.add_n(d_cost)

    y_N = labeled(y_c)
    cost = -tf.reduce_mean(tf.reduce_sum(outputs*tf.log(y_N), 1))  # supervised cost
    loss = cost + u_cost  # total cost


# #### accuracy

# In[24]:

with tf.name_scope(name="pred_cost"):
    pred_cost = -tf.reduce_mean(tf.reduce_sum(outputs*tf.log(y), 1))  # cost used for prediction

with tf.name_scope(name="accuracy"):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(outputs, 1))  # no of correct predictions
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) * tf.constant(100.0)


# #### optimizer

# In[25]:

with tf.name_scope(name="Optimizer"):
    learning_rate = tf.Variable(starter_learning_rate, trainable=False)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)


# In[26]:

# add the updates of batch normalization statistics to train_step
bn_updates = tf.group(*bn_assigns)
with tf.control_dependencies([train_step]):
    train_step = tf.group(bn_updates)


# ### Computational Graph

# In[27]:

show_computational_graph(tf.get_default_graph())


# ### Load MNIST

# In[28]:

print( "=== Loading Data ===" )
mnist = mnist_input_data.read_data_sets(train_dir=str(raw_Path / "MNIST_data"),
                                        n_labeled=num_labeled,
                                        fake_data=False,
                                        one_hot=True)


# In[29]:

saver = tf.train.Saver()


# ### Session

# In[30]:

print( "=== Starting Session ===" )
sess = tf.Session()


# In[31]:

i_iter = 0


# #### checkpoints

# In[32]:

ckpt = tf.train.get_checkpoint_state('checkpoints/')  # get latest checkpoint (if any)
if ckpt and ckpt.model_checkpoint_path:
    # if checkpoint exists, restore the parameters and set epoch_n and i_iter
    saver.restore(sess, ckpt.model_checkpoint_path)
    # The epoch number is the suffix after the LAST '-'; splitting from the
    # right keeps this correct when the path itself contains hyphens.
    epoch_n = int(ckpt.model_checkpoint_path.rsplit('-', 1)[-1])
    i_iter = (epoch_n+1) * (num_examples//batch_size)
    print( "Restored Epoch ", epoch_n )
else:
    # no checkpoint exists. create checkpoints directory if it does not exist.
    os.makedirs('checkpoints', exist_ok=True)
    init = tf.global_variables_initializer()
    sess.run(init)


# In[33]:

print( "=== Training ===" )
print( "Initial Accuracy: ",
       sess.run(accuracy, feed_dict={
           inputs: mnist.semi_test.images,
           outputs: mnist.semi_test.labels,
           training: False}),
       "%" )


# #### TRAINING

# In[34]:

# Build the learning-rate update op once and feed the new value at run time.
# Calling `learning_rate.assign(...)` inside the loop would add a new op to
# the graph at every epoch.
new_learning_rate = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
update_learning_rate = learning_rate.assign(new_learning_rate)

for i in tqdm.tqdm(range(i_iter, num_iter)):
    images, labels = mnist.semi_train.next_batch(batch_size)
    sess.run(train_step, feed_dict={inputs: images, outputs: labels, training: True})
    # True on the last iteration of each epoch
    if (i > 1) and ((i+1) % (num_iter//num_epochs) == 0):
        epoch_n = i//(num_examples//batch_size)
        if (epoch_n+1) >= decay_after:
            # decay learning rate
            # learning_rate = starter_learning_rate * ((num_epochs - epoch_n) / (num_epochs - decay_after))
            ratio = 1.0 * (num_epochs - (epoch_n+1))  # epoch_n + 1 because learning rate is set for next epoch
            ratio = max(0, ratio / (num_epochs - decay_after))
            sess.run(update_learning_rate,
                     feed_dict={new_learning_rate: starter_learning_rate * ratio})
        print( "iter {}: test_acc:{}%".format(i, sess.run(accuracy, feed_dict={
                                                                                    inputs: mnist.semi_test.images,
                                                                                    outputs: mnist.semi_test.labels,
                                                                                    training: False})
                                              )
             )

# Evaluate once more on the test split after training and report the result.
# The "%" belongs inside the format string: passed as an extra argument to
# `str.format` it was silently ignored and never printed.
print( "Final Accuracy: {}%".format(sess.run(accuracy, feed_dict={
                                                                    inputs: mnist.semi_test.images,
                                                                    outputs: mnist.semi_test.labels,
                                                                    training: False})
                                    )
     )
sess.close()

# Display the graph again after training.
show_computational_graph(tf.get_default_graph())