#!/usr/bin/env python
# coding: utf-8

# # ADAGE: Pan-cancer gene expression
# 
# **Gregory Way 2017**
# 
# This script trains a denoising autoencoder for cancer gene expression data using Keras. It modifies the framework presented by the ADAGE (Analysis using denoising autoencoders of gene expression) model published by [Tan _et al_ 2015](https://doi.org/10.1128/mSystems.00025-15).
# 
# An ADAGE model learns a non-linear, reduced dimensional representation of gene expression data by bottlenecking raw features into a smaller set. The model is then trained by minimizing the information lost between input and reconstructed input.
# 
# The specific model trained in this notebook consists of gene expression input (5000 most variably expressed genes by median absolute deviation) compressed down into one length 100 vector. The hidden layer is then decoded back to the original 5000 dimensions. The encoding (compression) layer has a `relu` activation and the decoding layer has a `sigmoid` activation. The weights of each layer are glorot uniform initialized. We include an l1 regularization term (see [`keras.regularizers.l1`](https://keras.io/regularizers/) for more details) to induce sparsity in the model, as well as a term controlling the probability of input feature dropout. This is only active during training and is the denoising aspect of the model. See [`keras.layers.noise.Dropout`](https://keras.io/layers/core/) for more details.
# 
# We train the autoencoder with the Adadelta optimizer and MSE reconstruction loss.
# 
# The pan-cancer ADAGE model is similar to tybalt, but does not constrain the features to match a Gaussian distribution. It is an active research question if the VAE learned features provide any additional benefits over ADAGE features. The VAE is a generative model and therefore permits easy generation of fake data. Additionally, we hypothesize that the VAE learns a manifold that can be interpolated to extract meaningful biological knowledge.

# In[1]:


import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pydot
import graphviz
from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from keras.layers import Input, Dense, Dropout, Activation
from keras.layers.noise import GaussianDropout
from keras.models import Model
from keras.regularizers import l1
from keras import optimizers
import keras


# In[2]:


print(keras.__version__)


# In[3]:


get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-notebook')


# In[4]:


sns.set(style="white", color_codes=True)
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":15,"axes.labelsize":20,
                             'xtick.labelsize':14, 'ytick.labelsize':14})


# In[5]:


# Load RNAseq data
rnaseq_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')
rnaseq_df = pd.read_table(rnaseq_file, index_col=0)
print(rnaseq_df.shape)
rnaseq_df.head(2)


# In[6]:


np.random.seed(123)


# In[7]:


# Split 10% test set randomly
test_set_percent = 0.1
rnaseq_test_df = rnaseq_df.sample(frac=test_set_percent)
rnaseq_train_df = rnaseq_df.drop(rnaseq_test_df.index)


# ## Parameter Sweep Results
# 
# We previously performed a parameter sweep search over a grid of potential hyperparameter values. Based on this sweep, we determined that the optimal ADAGE parameters are:
# 
# | Parameter | Optimal Setting |
# | :-------: | :-------------: |
# | Learning Rate | 1.1 |
# | Sparsity | 0 |
# | Noise | 0.05 |
# | Epochs | 100 |
# | Batch Size | 50 |

# In[8]:


num_features = rnaseq_df.shape[1]
encoding_dim = 100
sparsity = 0
noise = 0.05
epochs = 100
batch_size = 50
learning_rate = 1.1


# In[9]:


# Build the Keras graph
input_rnaseq = Input(shape=(num_features, ))
encoded_rnaseq = Dropout(noise)(input_rnaseq)
encoded_rnaseq_2 = Dense(encoding_dim,
                         activity_regularizer=l1(sparsity))(encoded_rnaseq)
activation = Activation('relu')(encoded_rnaseq_2)
decoded_rnaseq = Dense(num_features, activation='sigmoid')(activation)

autoencoder = Model(input_rnaseq, decoded_rnaseq)


# In[10]:


autoencoder.summary()


# In[11]:


# Visualize the connections of the custom VAE model
output_model_file = os.path.join('figures', 'adage_architecture.png')
plot_model(autoencoder, to_file=output_model_file)

SVG(model_to_dot(autoencoder).create(prog='dot', format='svg'))


# In[12]:


# Separate out the encoder and decoder model
encoder = Model(input_rnaseq, encoded_rnaseq_2)

encoded_input = Input(shape=(encoding_dim, ))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))


# In[13]:


# Compile the autoencoder to prepare for training
adadelta = optimizers.Adadelta(lr=learning_rate)
autoencoder.compile(optimizer=adadelta, loss='mse')


# In[14]:


get_ipython().run_cell_magic('time', '', 'hist = autoencoder.fit(np.array(rnaseq_train_df), np.array(rnaseq_train_df),\n                       shuffle=True,\n                       epochs=epochs,\n                       batch_size=batch_size,\n                       validation_data=(np.array(rnaseq_test_df), np.array(rnaseq_test_df)))\n')


# In[15]:


# Visualize training performance
history_df = pd.DataFrame(hist.history)
hist_plot_file = os.path.join('figures', 'adage_training.png')
ax = history_df.plot()
ax.set_xlabel('Epochs')
ax.set_ylabel('ADAGE Reconstruction Loss')
fig = ax.get_figure()
fig.savefig(hist_plot_file)


# ### Save Model Outputs

# In[16]:


# Encode rnaseq into the hidden/latent representation - and save output
encoded_samples = encoder.predict(np.array(rnaseq_df))
encoded_rnaseq_df = pd.DataFrame(encoded_samples, index=rnaseq_df.index)

encoded_rnaseq_df.columns.name = 'sample_id'
encoded_rnaseq_df.columns = encoded_rnaseq_df.columns + 1
encoded_file = os.path.join('data', 'encoded_adage_features.tsv')
encoded_rnaseq_df.to_csv(encoded_file, sep='\t')


# In[17]:


# Output weight matrix of gene contributions per node
weight_file = os.path.join('results', 'adage_gene_weights.tsv')

weight_matrix = pd.DataFrame(autoencoder.get_weights()[0], index=rnaseq_df.columns,
                             columns=range(1, 101)).T
weight_matrix.index.name = 'encodings'
weight_matrix.to_csv(weight_file, sep='\t')


# ## Observe reconstruction

# In[18]:


# Reconstruct input RNAseq
decoded_samples = decoder.predict(encoded_samples)

reconstructed_df = pd.DataFrame(decoded_samples, index=rnaseq_df.index,
                                columns=rnaseq_df.columns)


# In[19]:


reconstruction_fidelity = rnaseq_df - reconstructed_df

gene_mean = reconstruction_fidelity.mean(axis=0)
gene_abssum = reconstruction_fidelity.abs().sum(axis=0).divide(rnaseq_df.shape[0])
gene_summary = pd.DataFrame([gene_mean, gene_abssum], index=['gene mean', 'gene abs(sum)']).T
gene_summary.sort_values(by='gene abs(sum)', ascending=False).head()


# In[20]:


# Mean of gene reconstruction vs. absolute reconstructed difference per sample
reconstruct_fig_file = os.path.join('figures', 'adage_gene_reconstruction.png')
g = sns.jointplot('gene mean', 'gene abs(sum)', data=gene_summary, stat_func=None);
g.savefig(reconstruct_fig_file)


# In[21]:


# What are the most and least activated nodes
sum_node_activity = encoded_rnaseq_df.sum(axis=0).sort_values(ascending=False)

# Top 5 most active nodes
print(sum_node_activity.head(5))

# Bottom 10 least active nodes
sum_node_activity.tail(5)


# In[22]:


# Histogram of node activity for all 100 latent features
sum_node_activity.hist()
plt.xlabel('Activation Sum')
plt.ylabel('Count');


# In[23]:


# Histogram of sample activity for all 10,000 samples
encoded_rnaseq_df.sum(axis=1).hist()
plt.xlabel('Sample Total Activation')
plt.ylabel('Count');


# In[24]:


# Example of node activation distribution for the first two latent features
plt.figure(figsize=(6, 6))
plt.scatter(encoded_rnaseq_df.iloc[:, 1], encoded_rnaseq_df.iloc[:, 2])
plt.xlabel('Latent Feature 1')
plt.ylabel('Latent Feature 2');