#!/usr/bin/env python # coding: utf-8 # In[45]: """ Imports """ import numpy as np import json import pandas as pd import tensorflow as tf get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import time import os import urllib.request from tensorflow.models.rnn.ptb import reader os.chdir("C:/Users/macle/Desktop/UPC Masters/Semester 2/CI/Final Project") # In[46]: with open('reddit_data.json','r') as data_file: reddit_data = json.load(data_file) # In[47]: df = pd.DataFrame(reddit_data,columns=['user','subreddit','utc_stamp']) df['utc_stamp'] = pd.to_datetime(df['utc_stamp'],unit='s') sub_list = list(df.groupby('subreddit')['subreddit'].nunique().keys()) df.sort_values(by=['user','utc_stamp'], ascending=True, inplace=True) sub_seqs = [] current_sub = '' for rw in df.iterrows(): sub = rw[1]['subreddit'] if sub != current_sub: sub_seqs.append(sub) current_sub = sub # In[48]: """ Load and process data, utility functions """ raw_data = sub_seqs vocab = set(raw_data) vocab_size = len(vocab) idx_to_vocab = dict(enumerate(vocab)) vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys())) data = [vocab_to_idx[c] for c in raw_data] del raw_data def gen_epochs(n, num_steps, batch_size): for i in range(n): yield reader.ptb_iterator(data, batch_size, num_steps) def reset_graph(): if 'sess' in globals() and sess: sess.close() tf.reset_default_graph() def train_network(g, num_epochs, num_steps = 200, batch_size = 32, verbose = True, save=False): tf.set_random_seed(2345) with tf.Session() as sess: print(sess.run(tf.global_variables_initializer())) training_losses = [] for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps, batch_size)): training_loss = 0 steps = 0 training_state = None for X, Y in epoch: steps += 1 feed_dict={g['x']: X, g['y']: Y} if training_state is not None: feed_dict[g['init_state']] = training_state training_loss_, training_state, _ = sess.run([g['total_loss'], g['final_state'], g['train_step']], feed_dict) training_loss += training_loss_ if verbose: print("Average training loss for Epoch", idx, ":", training_loss/steps) training_losses.append(training_loss/steps) if isinstance(save, str): g['saver'].save(sess, save) return training_losses # In[49]: class GRUCell(tf.nn.rnn_cell.RNNCell): """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).""" def __init__(self, num_units): self._num_units = num_units @property def state_size(self): return self._num_units @property def output_size(self): return self._num_units def __call__(self, inputs, state, scope=None): with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. ru = tf.nn.rnn_cell._linear([inputs, state], 2 * self._num_units, True, 1.0) ru = tf.nn.sigmoid(ru) r, u = tf.split(1, 2, ru) with tf.variable_scope("Candidate"): c = tf.nn.tanh(tf.nn.rnn_cell._linear([inputs, r * state], self._num_units, True)) new_h = u * state + (1 - u) * c return new_h, new_h # In[50]: class CustomCell(tf.nn.rnn_cell.RNNCell): """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).""" def __init__(self, num_units, num_weights): self._num_units = num_units self._num_weights = num_weights @property def state_size(self): return self._num_units @property def output_size(self): return self._num_units def __call__(self, inputs, state, scope=None): with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. ru = tf.nn.rnn_cell._linear([inputs, state], 2 * self._num_units, True, 1.0) ru = tf.nn.sigmoid(ru) r, u = tf.split(1, 2, ru) with tf.variable_scope("Candidate"): lambdas = tf.nn.rnn_cell._linear([inputs, state], self._num_weights, True) lambdas = tf.split(1, self._num_weights, tf.nn.softmax(lambdas)) Ws = tf.get_variable("Ws", shape = [self._num_weights, inputs.get_shape()[1], self._num_units]) Ws = [tf.squeeze(i) for i in tf.split(0, self._num_weights, Ws)] candidate_inputs = [] for idx, W in enumerate(Ws): candidate_inputs.append(tf.matmul(inputs, W) * lambdas[idx]) Wx = tf.add_n(candidate_inputs) c = tf.nn.tanh(Wx + tf.nn.rnn_cell._linear([r * state], self._num_units, True, scope="second")) new_h = u * state + (1 - u) * c return new_h, new_h # In[51]: def ln(tensor, scope = None, epsilon = 1e-5): """ Layer normalizes a 2D tensor along its second axis """ assert(len(tensor.get_shape()) == 2) m, v = tf.nn.moments(tensor, [1], keep_dims=True) if not isinstance(scope, str): scope = '' with tf.variable_scope(scope + 'layer_norm'): scale = tf.get_variable('scale', shape=[tensor.get_shape()[1]], initializer=tf.constant_initializer(1)) shift = tf.get_variable('shift', shape=[tensor.get_shape()[1]], initializer=tf.constant_initializer(0)) LN_initial = (tensor - m) / tf.sqrt(v + epsilon) return LN_initial * scale + shift # In[52]: class LayerNormalizedLSTMCell(tf.nn.rnn_cell.RNNCell): """ Adapted from TF's BasicLSTMCell to use Layer Normalization. Note that state_is_tuple is always True. """ def __init__(self, num_units, forget_bias=1.0, activation=tf.nn.tanh): self._num_units = num_units self._forget_bias = forget_bias self._activation = activation @property def state_size(self): return tf.nn.rnn_cell.LSTMStateTuple(self._num_units, self._num_units) @property def output_size(self): return self._num_units def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with tf.variable_scope(scope or type(self).__name__): c, h = state # change bias argument to False since LN will add bias via shift concat = tf.nn.rnn_cell._linear([inputs, h], 4 * self._num_units, False) i, j, f, o = tf.split(1, 4, concat) # add layer normalization to each gate i = ln(i, scope = 'i/') j = ln(j, scope = 'j/') f = ln(f, scope = 'f/') o = ln(o, scope = 'o/') new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j)) # add layer_normalization in calculation of new hidden state new_h = self._activation(ln(new_c, scope = 'new_h/')) * tf.nn.sigmoid(o) new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h) return new_h, new_state # In[53]: def build_graph( cell_type = None, num_weights_for_custom_cell = 5, state_size = 100, num_classes = vocab_size, batch_size = 32, num_steps = 200, num_layers = 3, build_with_dropout=False, learning_rate = 1e-4): reset_graph() x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder') y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder') dropout = tf.constant(1.0) embeddings = tf.get_variable('embedding_matrix', [num_classes, state_size]) rnn_inputs = tf.nn.embedding_lookup(embeddings, x) if cell_type == 'Custom': cell = CustomCell(state_size, num_weights_for_custom_cell) elif cell_type == 'GRU': cell = tf.nn.rnn_cell.GRUCell(state_size) elif cell_type == 'LSTM': cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True) elif cell_type == 'LN_LSTM': cell = LayerNormalizedLSTMCell(state_size) else: cell = tf.nn.rnn_cell.BasicRNNCell(state_size) if build_with_dropout: cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=dropout) if cell_type == 'LSTM' or cell_type == 'LN_LSTM': cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True) else: cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers) if build_with_dropout: cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout) init_state = cell.zero_state(batch_size, tf.float32) rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state) with tf.variable_scope('softmax'): W = tf.get_variable('W', [state_size, num_classes]) b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0)) #reshape rnn_outputs and y rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size]) y_reshaped = tf.reshape(y, [-1]) logits = tf.matmul(rnn_outputs, W) + b predictions = tf.nn.softmax(logits) total_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_reshaped)) train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss) return dict( x = x, y = y, init_state = init_state, final_state = final_state, total_loss = total_loss, train_step = train_step, preds = predictions, saver = tf.train.Saver(), inputs = rnn_inputs ) # In[58]: g = build_graph(cell_type='LN_LSTM', num_steps=80) t = time.time() losses = train_network(g, 10, num_steps=80, save="tf_saves/LN_LSTM_1_epochs") print("It took", time.time() - t, "seconds to train for 10 epochs.") print("The average loss on the final epoch was:", losses[-1]) # In[59]: def generate_characters(g, checkpoint, num_chars,state=None, prompt=['2007scape'], pick_top_chars=None): """ Accepts a current character, initial state""" with tf.Session() as sess: print(sess.run(tf.global_variables_initializer())) g['saver'].restore(sess, checkpoint) chars = [] for seed_letter in prompt: current_char = vocab_to_idx[seed_letter] chars.append(idx_to_vocab[current_char]) if state is not None: feed_dict={g['x']: [[current_char]], g['init_state']: state} else: feed_dict={g['x']: [[current_char]]} for i in range(num_chars): if state is not None: feed_dict={g['x']: [[current_char]], g['init_state']: state} else: feed_dict={g['x']: [[current_char]]} preds, state = sess.run([g['preds'],g['final_state']], feed_dict) if pick_top_chars is not None: p = np.squeeze(preds) p[np.argsort(p)[:-pick_top_chars]] = 0 p = p / np.sum(p) current_char = np.random.choice(vocab_size, 1, p=p)[0] else: current_char = np.random.choice(vocab_size, 1, p=np.squeeze(preds))[0] chars.append(idx_to_vocab[current_char]) return chars # In[63]: with open('test_reddit_data.json','r') as data_file: test_reddit_data = json.load(data_file) # In[77]: df = pd.DataFrame(test_reddit_data,columns=['user','subreddit','submission','utc_stamp','rnd_words']) df['utc_stamp'] = pd.to_datetime(df['utc_stamp'],unit='s') df.sort_values(by=['user','utc_stamp'], ascending=True, inplace=True) sub_seqs = [] current_user = '' current_sub = '' for rw in df.iterrows(): sub = rw[1]['subreddit'] if sub in vocab: if current_user == '': usr_seq = [sub] elif rw[1]['user'] != current_user: sub_seqs.append(usr_seq) usr_seq = [sub] elif sub != current_sub: usr_seq.append(sub) current_user = rw[1]['user'] current_sub = sub # In[78]: sub_seqs[1] # In[87]: g = build_graph(cell_type='LN_LSTM', num_steps=1, batch_size=1) results = generate_characters(g, "tf_saves/LN_LSTM_1_epochs", 20, prompt=sub_seqs[1]) # In[88]: list(set(sub_seqs[1]) & set(results[-20:]))